Merge branch 'shader-file-reorg'

1. Move all GL entrypoint functions and files into src/mesa/main/
This includes the ARB vp/fp, NV vp/fp, ATI fragshader and GLSL bits
that were in src/mesa/shader/

2. Move src/mesa/shader/slang/ to src/mesa/slang/ to reduce the tree depth

3. Rename src/mesa/shader/ to src/mesa/program/ since all the
remaining files are concerned with GPU programs.

4. Misc code refactoring.  In particular, I got rid of most of the
GLSL-related ctx->Driver hook functions.  None of the drivers used
them.

Conflicts:
	src/mesa/drivers/dri/i965/brw_context.c
diff --git a/configs/autoconf.in b/configs/autoconf.in
index 3428e3a..417138b 100644
--- a/configs/autoconf.in
+++ b/configs/autoconf.in
@@ -97,7 +97,7 @@
 
 # Driver specific build vars
 DRI_DIRS = @DRI_DIRS@ 
-EGL_DISPLAYS = @EGL_DISPLAYS@
+EGL_PLATFORMS = @EGL_PLATFORMS@
 EGL_CLIENT_APIS = @EGL_CLIENT_APIS@
 
 # Dependencies
diff --git a/configs/default b/configs/default
index 9241e80..3d97444 100644
--- a/configs/default
+++ b/configs/default
@@ -110,8 +110,8 @@
 GALLIUM_TARGET_DIRS = libgl-xlib
 GALLIUM_STATE_TRACKERS_DIRS = glx vega
 
-# native displays EGL should support
-EGL_DISPLAYS = x11
+# native platforms EGL should support
+EGL_PLATFORMS = x11
 EGL_CLIENT_APIS = $(GL_LIB)
 
 # Library dependencies
diff --git a/configure.ac b/configure.ac
index 1056fa1..5cfdd45 100644
--- a/configure.ac
+++ b/configure.ac
@@ -474,7 +474,7 @@
 GALLIUM_DIRS="auxiliary drivers state_trackers"
 GALLIUM_TARGET_DIRS=""
 GALLIUM_WINSYS_DIRS="sw"
-GALLIUM_DRIVERS_DIRS="softpipe failover trace rbug identity"
+GALLIUM_DRIVERS_DIRS="softpipe failover galahad trace rbug identity"
 GALLIUM_STATE_TRACKERS_DIRS=""
 
 case "$mesa_driver" in
@@ -860,7 +860,7 @@
 
 case $DRI_DIRS in
 *i915*|*i965*)
-    PKG_CHECK_MODULES([INTEL], [libdrm_intel >= 2.4.19])
+    PKG_CHECK_MODULES([INTEL], [libdrm_intel >= 2.4.21])
     ;;
 esac
 
@@ -891,7 +891,7 @@
 fi
 AC_ARG_ENABLE([gl-osmesa],
     [AS_HELP_STRING([--enable-gl-osmesa],
-        [enable OSMesa on libGL @<:@default=enabled for xlib driver@:>@])],
+        [enable OSMesa with libGL @<:@default=enabled for xlib driver@:>@])],
     [gl_osmesa="$enableval"],
     [gl_osmesa="$default_gl_osmesa"])
 if test "x$gl_osmesa" = xyes; then
@@ -926,8 +926,8 @@
 esac
 AC_SUBST([OSMESA_LIB])
 
-case "$mesa_driver" in
-osmesa)
+case "$DRIVER_DIRS" in
+*osmesa*)
     # only link libraries with osmesa if shared
     if test "$enable_static" = no; then
         OSMESA_LIB_DEPS="-lm -lpthread $SELINUX_LIBS $DLOPEN_LIBS"
@@ -937,19 +937,7 @@
     OSMESA_MESA_DEPS=""
     OSMESA_PC_LIB_PRIV="-lm -lpthread $SELINUX_LIBS $DLOPEN_LIBS"
     ;;
-*)
-    # Link OSMesa to libGL otherwise
-    OSMESA_LIB_DEPS=""
-    # only link libraries with osmesa if shared
-    if test "$enable_static" = no; then
-        OSMESA_MESA_DEPS='-l$(GL_LIB)'
-    else
-        OSMESA_MESA_DEPS=""
-    fi
-    OSMESA_PC_REQ="gl"
-    ;;
 esac
-OSMESA_PC_LIB_PRIV="$OSMESA_PC_LIB_PRIV"
 AC_SUBST([OSMESA_LIB_DEPS])
 AC_SUBST([OSMESA_MESA_DEPS])
 AC_SUBST([OSMESA_PC_REQ])
@@ -1313,18 +1301,22 @@
         HAVE_XEXTPROTO_71="no")
 fi
 
+AC_ARG_WITH([egl-platforms],
+    [AS_HELP_STRING([--with-egl-platforms@<:@=DIRS...@:>@],
+        [comma delimited native platforms libEGL supports, e.g.
+        "x11,kms" @<:@default=auto@:>@])],
+    [with_egl_platforms="$withval"],
+    [with_egl_platforms=yes])
 AC_ARG_WITH([egl-displays],
     [AS_HELP_STRING([--with-egl-displays@<:@=DIRS...@:>@],
-        [comma delimited native displays libEGL supports, e.g.
-        "x11,kms" @<:@default=auto@:>@])],
-    [with_egl_displays="$withval"],
-    [with_egl_displays=yes])
+        [DEPRECATED.  Use --with-egl-platforms instead])],
+    [with_egl_platforms="$withval"])
 
-EGL_DISPLAYS=""
-case "$with_egl_displays" in
+EGL_PLATFORMS=""
+case "$with_egl_platforms" in
 yes)
     if test "x$enable_egl" = xyes && test "x$mesa_driver" != xosmesa; then
-        EGL_DISPLAYS="x11"
+        EGL_PLATFORMS="x11"
     fi
     ;;
 *)
@@ -1332,15 +1324,18 @@
         AC_MSG_ERROR([cannot build egl state tracker without EGL library])
     fi
     # verify the requested driver directories exist
-    egl_displays=`IFS=', '; echo $with_egl_displays`
-    for dpy in $egl_displays; do
-        test -d "$srcdir/src/gallium/state_trackers/egl/$dpy" || \
-            AC_MSG_ERROR([EGL display '$dpy' does't exist])
+    egl_platforms=`IFS=', '; echo $with_egl_platforms`
+    for plat in $egl_platforms; do
+        test -d "$srcdir/src/gallium/state_trackers/egl/$plat" || \
+            AC_MSG_ERROR([EGL platform '$plat' doesn't exist])
+        if test "$plat" = "fbdev"; then  # fbdev also builds the sw/fbdev winsys
+            GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS sw/fbdev"
+        fi
     done
-    EGL_DISPLAYS="$egl_displays"
+    EGL_PLATFORMS="$egl_platforms"
     ;;
 esac
-AC_SUBST([EGL_DISPLAYS])
+AC_SUBST([EGL_PLATFORMS])
 
 AC_ARG_WITH([egl-driver-dir],
     [AS_HELP_STRING([--with-egl-driver-dir=DIR],
@@ -1435,20 +1430,35 @@
 fi
 
 dnl
-dnl Gallium Intel configuration
+dnl Gallium i915 configuration
 dnl
-AC_ARG_ENABLE([gallium-intel],
-    [AS_HELP_STRING([--enable-gallium-intel],
-        [build gallium intel @<:@default=disabled@:>@])],
-    [enable_gallium_intel="$enableval"],
-    [enable_gallium_intel=auto])
-if test "x$enable_gallium_intel" = xyes; then
+AC_ARG_ENABLE([gallium-i915],
+    [AS_HELP_STRING([--enable-gallium-i915],
+        [build gallium i915 @<:@default=disabled@:>@])],
+    [enable_gallium_i915="$enableval"],
+    [enable_gallium_i915=auto])
+if test "x$enable_gallium_i915" = xyes; then
     GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS i915/sw"
-    GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS i915 i965"
-    gallium_check_st "i915/drm i965/drm" "dri-i915 dri-i965" "egl-i915 egl-i965" "xorg-i915 xorg-i965"
-elif test "x$enable_gallium_intel" = xauto; then
+    GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS i915"
+    gallium_check_st "i915/drm" "dri-i915" "egl-i915" "xorg-i915"
+elif test "x$enable_gallium_i915" = xauto; then
     GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS i915/sw"
-    GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS i915 i965"
+    GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS i915"
+fi
+
+dnl
+dnl Gallium i965 configuration
+dnl
+AC_ARG_ENABLE([gallium-i965],
+    [AS_HELP_STRING([--enable-gallium-i965],
+        [build gallium i965 @<:@default=disabled@:>@])],
+    [enable_gallium_i965="$enableval"],
+    [enable_gallium_i965=auto])
+if test "x$enable_gallium_i965" = xyes; then
+    GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS i965"
+    gallium_check_st "i965/drm" "dri-i965" "egl-i965" "xorg-i965"
+elif test "x$enable_gallium_i965" = xauto; then
+    GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS i965"
 fi
 
 dnl
@@ -1588,7 +1598,7 @@
 echo "        Static libs:     $enable_static"
 if test "$enable_egl" = yes; then
     echo "        EGL:             $EGL_DRIVERS_DIRS"
-    echo "        EGL displays:    $EGL_DISPLAYS"
+    echo "        EGL platforms:   $EGL_PLATFORMS"
 else
     echo "        EGL:             no"
 fi
diff --git a/docs/egl.html b/docs/egl.html
index ad3b850..b2198e9 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -69,14 +69,18 @@
 
 </li>
 
-<li><code>--with-egl-displays</code>
+<li><code>--with-egl-platforms</code>
 
-<p>List the window system(s) to support.  It is by default <code>x11</code>,
-which supports the X Window System.  Its argument is a comma separated string
-like, for example, <code>--with-egl-displays=x11,kms</code>.  Because an EGL
-driver decides which window system to support, this example will enable two
-(sets of) EGL drivers.  One supports the X window system and the other supports
-bare KMS (kernel modesetting).</p>
+<p>List the native platform window system(s) to support.  It is by default
+<code>x11</code>, which supports the X Window System.  Its argument is a comma
+separated string like, for example, <code>--with-egl-platforms=x11,kms</code>.
+Because an EGL driver decides which window system to support, this example will
+enable two (sets of) EGL drivers.  One supports the X window system and the
+other supports bare KMS (kernel modesetting).</p>
+
+<p>The available platforms are <code>x11</code>, <code>kms</code>,
+<code>fbdev</code>, and <code>gdi</code>.  The <code>gdi</code> platform can
+only be built with SCons.</p>
 
 </li>
 
@@ -118,16 +122,10 @@
 
 <h2>Use EGL</h2>
 
-<p>There are demos for the client APIs supported by EGL.  They can be found in
-<code>progs/egl/</code>, You can use them to test your build.  For example,</p>
+<h3>Demos</h3>
 
-<pre>
-  $ cd progs/egl/eglut
-  $ make
-  $ cd ../opengles1
-  $ make
-  $ ./torus_x11
-</pre>
+<p>There are demos for the client APIs supported by EGL.  They can be found in
+the mesa/demos repository.</p>
 
 <h3>Environment Variables</h3>
 
@@ -153,14 +151,14 @@
 
 </li>
 
-<li><code>EGL_DISPLAY</code>
+<li><code>EGL_PLATFORM</code>
 
 <p>When <code>EGL_DRIVER</code> is not set, the main library loads <em>all</em>
-EGL drivers that support a certain window system.  <code>EGL_DISPLAY</code> can
-be used to specify the window system and the valid values are, for example,
+EGL drivers that support a certain window system.  <code>EGL_PLATFORM</code>
+can be used to specify the window system and the valid values are, for example,
 <code>x11</code> or <code>kms</code>.  When the variable is not set, the main
 library defaults the value to the first window system listed in
-<code>--with-egl-displays</code> at configuration time.
+<code>--with-egl-platforms</code> at configuration time.
 
 </li>
 
@@ -184,27 +182,27 @@
 
 <p>There are two categories of EGL drivers: Gallium and classic.</p>
 
-<p>Gallium EGL drivers supports all rendering APIs specified in EGL 1.4.  The
-support for optional EGL functions and EGL extensions is usually more complete
-than the classic ones.  These drivers depend on the <code>egl</code> state
-tracker to build.  The available drivers are</p>
+<p>Gallium EGL drivers support all rendering APIs specified in EGL 1.4.  These
+drivers depend on the <code>egl</code> state tracker to build.  The available
+drivers are</p>
 
 <ul>
 <li><code>egl_&lt;dpy&gt;_i915</code></li>
 <li><code>egl_&lt;dpy&gt;_i965</code></li>
-<li><code>egl_&lt;dpy&gt;_radeon</code></li>
 <li><code>egl_&lt;dpy&gt;_nouveau</code></li>
+<li><code>egl_&lt;dpy&gt;_radeon</code></li>
 <li><code>egl_&lt;dpy&gt;_swrast</code></li>
 <li><code>egl_&lt;dpy&gt;_vmwgfx</code></li>
 </ul>
 
-<p><code>&lt;dpy&gt;</code> is given by <code>--with-egl-displays</code> at
-configuration time.  There will be one EGL driver for each combination of the
-displays listed and the hardware drivers enabled.</p>
+<p><code>&lt;dpy&gt;</code> is given by <code>--with-egl-platforms</code> at
+configuration time.  There is usually one EGL driver for each combination of
+the platforms listed and the pipe drivers enabled.  When the platform is pure
+software or pure hardware, non-working combinations will not be built.</p>
 
-<p>Classic EGL drivers, on the other hand, supports only OpenGL as its
-rendering API.  They can be found under <code>src/egl/drivers/</code>.  There
-are 3 of them</p>
+<p>Classic EGL drivers, on the other hand, support only a subset of the
+available rendering APIs.  They can be found under
+<code>src/egl/drivers/</code>.  There are 3 of them</p>
 
 <ul>
 <li><code>egl_glx</code>
@@ -324,7 +322,7 @@
 
 <ul>
 <li>Pass the conformance tests</li>
-<li>Better automatic driver selection: <code>EGL_DISPLAY</code> loads all
+<li>Better automatic driver selection: <code>EGL_PLATFORM</code> loads all
 drivers and might eat too much memory.</li>
 
 </ul>
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 39b02b8..47e7f80 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -13,6 +13,7 @@
 </p>
 
 <UL>
+<LI><A HREF="relnotes-7.9.html">7.9 release notes</A>
 <LI><A HREF="relnotes-7.8.2.html">7.8.2 release notes</A>
 <LI><A HREF="relnotes-7.8.1.html">7.8.1 release notes</A>
 <LI><A HREF="relnotes-7.8.html">7.8 release notes</A>
diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h
index ce1dca3..68591bd 100644
--- a/include/EGL/eglext.h
+++ b/include/EGL/eglext.h
@@ -208,6 +208,17 @@
 
 #endif /* EGL_MESA_copy_context */
 
+#ifndef EGL_MESA_drm_display
+#define EGL_MESA_drm_display 1
+
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLDisplay EGLAPIENTRY eglGetDRMDisplayMESA(int fd);
+#endif /* EGL_EGLEXT_PROTOTYPES */
+
+typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETDRMDISPLAYMESA) (int fd);
+
+#endif /* EGL_MESA_drm_display */
+
 #ifndef EGL_KHR_image_base
 #define EGL_KHR_image_base 1
 /* Most interfaces defined by EGL_KHR_image_pixmap above */
diff --git a/include/EGL/eglplatform.h b/include/EGL/eglplatform.h
index c625088..33a3e5f 100644
--- a/include/EGL/eglplatform.h
+++ b/include/EGL/eglplatform.h
@@ -80,6 +80,14 @@
 
 #elif defined(__unix__) || defined(__unix)
 
+#ifdef MESA_EGL_NO_X11_HEADERS
+
+typedef void            *EGLNativeDisplayType;
+typedef khronos_uint32_t EGLNativePixmapType;
+typedef khronos_uint32_t EGLNativeWindowType;
+
+#else
+
 /* X11 (tentative)  */
 #include <X11/Xlib.h>
 #include <X11/Xutil.h>
@@ -88,6 +96,8 @@
 typedef Pixmap   EGLNativePixmapType;
 typedef Window   EGLNativeWindowType;
 
+#endif /* MESA_EGL_NO_X11_HEADERS */
+
 #else
 #error "Platform not recognized"
 #endif
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index aa384cb..5a5e43b 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -702,15 +702,18 @@
    struct dri2_egl_display *dri2_dpy;
    unsigned int api_mask;
 
+   if (disp->Platform != _EGL_PLATFORM_X11)
+      return EGL_FALSE;
+
    dri2_dpy = malloc(sizeof *dri2_dpy);
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
    disp->DriverData = (void *) dri2_dpy;
-   if (disp->NativeDisplay == NULL) {
+   if (disp->PlatformDisplay == NULL) {
       dri2_dpy->conn = xcb_connect(0, 0);
    } else {
-      dri2_dpy->conn = XGetXCBConnection(disp->NativeDisplay);
+      dri2_dpy->conn = XGetXCBConnection((Display *) disp->PlatformDisplay);
    }
 
    if (xcb_connection_has_error(dri2_dpy->conn)) {
@@ -815,7 +818,7 @@
  cleanup_driver:
    dlclose(dri2_dpy->driver);
  cleanup_conn:
-   if (disp->NativeDisplay == NULL)
+   if (disp->PlatformDisplay == NULL)
       xcb_disconnect(dri2_dpy->conn);
  cleanup_dpy:
    free(dri2_dpy);
@@ -837,7 +840,7 @@
    dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
    close(dri2_dpy->fd);
    dlclose(dri2_dpy->driver);
-   if (disp->NativeDisplay == NULL)
+   if (disp->PlatformDisplay == NULL)
       xcb_disconnect(dri2_dpy->conn);
    free(dri2_dpy);
    disp->DriverData = NULL;
diff --git a/src/egl/drivers/glx/egl_glx.c b/src/egl/drivers/glx/egl_glx.c
index e08ef5f..804dc02 100644
--- a/src/egl/drivers/glx/egl_glx.c
+++ b/src/egl/drivers/glx/egl_glx.c
@@ -498,11 +498,14 @@
 {
    struct GLX_egl_display *GLX_dpy;
 
+   if (disp->Platform != _EGL_PLATFORM_X11)
+      return EGL_FALSE;
+
    GLX_dpy = CALLOC_STRUCT(GLX_egl_display);
    if (!GLX_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
-   GLX_dpy->dpy = (Display *) disp->NativeDisplay;
+   GLX_dpy->dpy = (Display *) disp->PlatformDisplay;
    if (!GLX_dpy->dpy) {
       GLX_dpy->dpy = XOpenDisplay(NULL);
       if (!GLX_dpy->dpy) {
@@ -514,7 +517,7 @@
 
    if (!glXQueryVersion(GLX_dpy->dpy, &GLX_dpy->glx_maj, &GLX_dpy->glx_min)) {
       _eglLog(_EGL_WARNING, "GLX: glXQueryVersion failed");
-      if (!disp->NativeDisplay)
+      if (!disp->PlatformDisplay)
          XCloseDisplay(GLX_dpy->dpy);
       free(GLX_dpy);
       return EGL_FALSE;
@@ -526,7 +529,7 @@
    create_configs(disp, GLX_dpy, DefaultScreen(GLX_dpy->dpy));
    if (!disp->NumConfigs) {
       _eglLog(_EGL_WARNING, "GLX: failed to create any config");
-      if (!disp->NativeDisplay)
+      if (!disp->PlatformDisplay)
          XCloseDisplay(GLX_dpy->dpy);
       free(GLX_dpy);
       return EGL_FALSE;
@@ -558,7 +561,7 @@
    if (GLX_dpy->fbconfigs)
       XFree(GLX_dpy->fbconfigs);
 
-   if (!disp->NativeDisplay)
+   if (!disp->PlatformDisplay)
       XCloseDisplay(GLX_dpy->dpy);
    free(GLX_dpy);
 
@@ -617,10 +620,11 @@
 static void
 destroy_surface(_EGLDisplay *disp, _EGLSurface *surf)
 {
+   struct GLX_egl_display *GLX_dpy = GLX_egl_display(disp);
    struct GLX_egl_surface *GLX_surf = GLX_egl_surface(surf);
 
    if (GLX_surf->destroy)
-      GLX_surf->destroy(disp->NativeDisplay, GLX_surf->glx_drawable);
+      GLX_surf->destroy(GLX_dpy->dpy, GLX_surf->glx_drawable);
 
    free(GLX_surf);
 }
diff --git a/src/egl/main/Makefile b/src/egl/main/Makefile
index 82fd855..be27d94 100644
--- a/src/egl/main/Makefile
+++ b/src/egl/main/Makefile
@@ -7,7 +7,7 @@
 EGL_MAJOR = 1
 EGL_MINOR = 0
 
-INCLUDE_DIRS = -I$(TOP)/include $(X11_CFLAGS)
+INCLUDE_DIRS = -I$(TOP)/include
 
 HEADERS = \
 	eglcompiler.h \
@@ -49,12 +49,25 @@
 
 
 # use dl*() to load drivers
-LOCAL_CFLAGS = -D_EGL_PLATFORM_POSIX=1
+LOCAL_CFLAGS = -D_EGL_OS_UNIX=1
 
-EGL_DEFAULT_DISPLAY = $(word 1, $(EGL_DISPLAYS))
+EGL_DEFAULT_PLATFORM = $(firstword $(EGL_PLATFORMS))
+
+# map the default (first listed) EGL platform to its _EGLPlatformType value
+EGL_NATIVE_PLATFORM=_EGL_INVALID_PLATFORM
+ifeq ($(EGL_DEFAULT_PLATFORM),x11)
+EGL_NATIVE_PLATFORM=_EGL_PLATFORM_X11
+endif
+ifeq ($(EGL_DEFAULT_PLATFORM),kms)
+EGL_NATIVE_PLATFORM=_EGL_PLATFORM_DRM
+endif
+ifeq ($(EGL_DEFAULT_PLATFORM),fbdev)
+EGL_NATIVE_PLATFORM=_EGL_PLATFORM_FBDEV
+endif
 
 LOCAL_CFLAGS += \
-	-D_EGL_DEFAULT_DISPLAY=\"$(EGL_DEFAULT_DISPLAY)\" \
+	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM) \
+	-D_EGL_DEFAULT_PLATFORM=\"$(EGL_DEFAULT_PLATFORM)\" \
 	-D_EGL_DRIVER_SEARCH_DIR=\"$(EGL_DRIVER_INSTALL_DIR)\"
 
 .c.o:
diff --git a/src/egl/main/SConscript b/src/egl/main/SConscript
index f3fe996..fad0671 100644
--- a/src/egl/main/SConscript
+++ b/src/egl/main/SConscript
@@ -9,9 +9,10 @@
 	env = env.Clone()
 
 	env.Append(CPPDEFINES = [
-		'_EGL_DEFAULT_DISPLAY=\\"gdi\\"',
+		'_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_WINDOWS',
+		'_EGL_DEFAULT_PLATFORM=\\"gdi\\"',
 		'_EGL_DRIVER_SEARCH_DIR=\\"\\"',
-		'_EGL_PLATFORM_WINDOWS',
+		'_EGL_OS_WINDOWS',
 		'KHRONOS_DLL_EXPORTS',
 	])
 
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 9912043..1ec1486 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -250,7 +250,8 @@
 EGLDisplay EGLAPIENTRY
 eglGetDisplay(EGLNativeDisplayType nativeDisplay)
 {
-   _EGLDisplay *dpy = _eglFindDisplay(nativeDisplay);
+   _EGLPlatformType plat = _eglGetNativePlatform();
+   _EGLDisplay *dpy = _eglFindDisplay(plat, (void *) nativeDisplay);
    return _eglGetDisplayHandle(dpy);
 }
 
@@ -491,6 +492,8 @@
    EGLSurface ret;
 
    _EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);
+   if (disp->Platform != _eglGetNativePlatform())
+      RETURN_EGL_ERROR(disp, EGL_BAD_NATIVE_WINDOW, EGL_NO_SURFACE);
 
    surf = drv->API.CreateWindowSurface(drv, disp, conf, window, attrib_list);
    ret = (surf) ? _eglLinkSurface(surf, disp) : EGL_NO_SURFACE;
@@ -510,6 +513,8 @@
    EGLSurface ret;
 
    _EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);
+   if (disp->Platform != _eglGetNativePlatform())
+      RETURN_EGL_ERROR(disp, EGL_BAD_NATIVE_PIXMAP, EGL_NO_SURFACE);
 
    surf = drv->API.CreatePixmapSurface(drv, disp, conf, pixmap, attrib_list);
    ret = (surf) ? _eglLinkSurface(surf, disp) : EGL_NO_SURFACE;
@@ -667,6 +672,8 @@
    EGLBoolean ret;
 
    _EGL_CHECK_SURFACE(disp, surf, EGL_FALSE, drv);
+   if (disp->Platform != _eglGetNativePlatform())
+      RETURN_EGL_ERROR(disp, EGL_BAD_NATIVE_PIXMAP, EGL_FALSE);
    ret = drv->API.CopyBuffers(drv, disp, surf, target);
 
    RETURN_EGL_EVAL(disp, ret);
@@ -836,6 +843,9 @@
       { "eglQueryScreenModeMESA", (_EGLProc) eglQueryScreenModeMESA },
       { "eglQueryModeStringMESA", (_EGLProc) eglQueryModeStringMESA },
 #endif /* EGL_MESA_screen_surface */
+#ifdef EGL_MESA_drm_display
+      { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA },
+#endif
 #ifdef EGL_KHR_image_base
       { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR },
       { "eglDestroyImageKHR", (_EGLProc) eglDestroyImageKHR },
@@ -1098,6 +1108,24 @@
 #endif /* EGL_MESA_screen_surface */
 
 
+#ifdef EGL_MESA_drm_display
+
+/**
+ * Return the EGLDisplay for a DRM file descriptor (EGL_MESA_drm_display).
+ * The fd is kept in the display's void * PlatformDisplay slot, so it is
+ * widened through khronos_intptr_t first to avoid an int-to-pointer cast
+ * of mismatched size on 64-bit targets.
+ */
+EGLDisplay EGLAPIENTRY
+eglGetDRMDisplayMESA(int fd)
+{
+   _EGLDisplay *dpy =
+      _eglFindDisplay(_EGL_PLATFORM_DRM, (void *) (khronos_intptr_t) fd);
+   return _eglGetDisplayHandle(dpy);
+}
+
+#endif /* EGL_MESA_drm_display */
+
 /**
  ** EGL 1.2
  **/
diff --git a/src/egl/main/egldisplay.c b/src/egl/main/egldisplay.c
index 5dc5fd9..d666bda 100644
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -49,16 +49,19 @@
  * new one.
  */
 _EGLDisplay *
-_eglFindDisplay(EGLNativeDisplayType nativeDisplay)
+_eglFindDisplay(_EGLPlatformType plat, void *plat_dpy)
 {
    _EGLDisplay *dpy;
 
+   if (plat == _EGL_INVALID_PLATFORM)
+      return NULL;
+
    _eglLockMutex(_eglGlobal.Mutex);
 
    /* search the display list first */
    dpy = _eglGlobal.DisplayList;
    while (dpy) {
-      if (dpy->NativeDisplay == nativeDisplay)
+      if (dpy->Platform == plat && dpy->PlatformDisplay == plat_dpy)
          break;
       dpy = dpy->Next;
    }
@@ -68,7 +71,8 @@
       dpy = (_EGLDisplay *) calloc(1, sizeof(_EGLDisplay));
       if (dpy) {
          _eglInitMutex(&dpy->Mutex);
-         dpy->NativeDisplay = nativeDisplay;
+         dpy->Platform = plat;
+         dpy->PlatformDisplay = plat_dpy;
 
          /* add to the display list */ 
          dpy->Next = _eglGlobal.DisplayList;
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index 42e305f..0b325f7 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -7,6 +7,18 @@
 #include "eglmutex.h"
 
 
+enum _egl_platform_type {
+   _EGL_PLATFORM_WINDOWS,  /* native platform of the SCons ("gdi") build */
+   _EGL_PLATFORM_X11,      /* "x11" in --with-egl-platforms */
+   _EGL_PLATFORM_DRM,      /* "kms" in --with-egl-platforms */
+   _EGL_PLATFORM_FBDEV,    /* "fbdev" in --with-egl-platforms */
+
+   _EGL_NUM_PLATFORMS,     /* number of valid platforms listed above */
+   _EGL_INVALID_PLATFORM = -1 /* _eglFindDisplay() returns NULL for this */
+};
+typedef enum _egl_platform_type _EGLPlatformType;
+
+
 enum _egl_resource_type {
    _EGL_RESOURCE_CONTEXT,
    _EGL_RESOURCE_SURFACE,
@@ -39,6 +51,7 @@
 {
    EGLBoolean MESA_screen_surface;
    EGLBoolean MESA_copy_context;
+   EGLBoolean MESA_drm_display;
    EGLBoolean KHR_image_base;
    EGLBoolean KHR_image_pixmap;
    EGLBoolean KHR_vg_parent_image;
@@ -53,14 +66,15 @@
 };
 
 
-struct _egl_display 
+struct _egl_display
 {
    /* used to link displays */
    _EGLDisplay *Next;
 
    _EGLMutex Mutex;
 
-   EGLNativeDisplayType NativeDisplay;
+   _EGLPlatformType Platform;
+   void *PlatformDisplay;
 
    EGLBoolean Initialized; /**< True if the display is initialized */
    _EGLDriver *Driver;
@@ -92,7 +106,7 @@
 
 
 extern _EGLDisplay *
-_eglFindDisplay(EGLNativeDisplayType displayName);
+_eglFindDisplay(_EGLPlatformType plat, void *plat_dpy);
 
 
 PUBLIC void
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 631a871..db7b4a7 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -23,7 +23,7 @@
 #include "eglsurface.h"
 #include "eglimage.h"
 
-#if defined(_EGL_PLATFORM_POSIX)
+#if defined(_EGL_OS_UNIX)
 #include <dlfcn.h>
 #include <sys/types.h>
 #include <dirent.h>
@@ -34,7 +34,7 @@
 /**
  * Wrappers for dlopen/dlclose()
  */
-#if defined(_EGL_PLATFORM_WINDOWS)
+#if defined(_EGL_OS_WINDOWS)
 
 
 /* XXX Need to decide how to do dynamic name lookup on Windows */
@@ -64,7 +64,7 @@
 }
 
 
-#elif defined(_EGL_PLATFORM_POSIX)
+#elif defined(_EGL_OS_UNIX)
 
 
 static const char *DefaultDriverNames[] = {
@@ -119,11 +119,11 @@
    _eglLog(_EGL_DEBUG, "dlopen(%s)", driverPath);
    lib = open_library(driverPath);
 
-#if defined(_EGL_PLATFORM_WINDOWS)
+#if defined(_EGL_OS_WINDOWS)
    /* XXX untested */
    if (lib)
       mainFunc = (_EGLMain_t) GetProcAddress(lib, "_eglMain");
-#elif defined(_EGL_PLATFORM_POSIX)
+#elif defined(_EGL_OS_UNIX)
    if (lib) {
       union {
          _EGLMain_t func;
@@ -301,7 +301,7 @@
 static EGLBoolean
 _eglLoaderPattern(const char *dir, size_t len, void *loader_data)
 {
-#if defined(_EGL_PLATFORM_POSIX)
+#if defined(_EGL_OS_UNIX)
    const char *prefix, *suffix;
    size_t prefix_len, suffix_len;
    DIR *dirp;
@@ -352,7 +352,7 @@
    closedir(dirp);
 
    return EGL_TRUE;
-#else /* _EGL_PLATFORM_POSIX */
+#else /* _EGL_OS_UNIX */
    /* stop immediately */
    return EGL_FALSE;
 #endif
@@ -397,20 +397,20 @@
 {
    static const char *search_path;
 
-#if defined(_EGL_PLATFORM_POSIX) || defined(_EGL_PLATFORM_WINDOWS)
+#if defined(_EGL_OS_UNIX) || defined(_EGL_OS_WINDOWS)
    if (!search_path) {
       static char buffer[1024];
       const char *p;
       int ret;
 
       p = getenv("EGL_DRIVERS_PATH");
-#if defined(_EGL_PLATFORM_POSIX)
+#if defined(_EGL_OS_UNIX)
       if (p && (geteuid() != getuid() || getegid() != getgid())) {
          _eglLog(_EGL_DEBUG,
                "ignore EGL_DRIVERS_PATH for setuid/setgid binaries");
          p = NULL;
       }
-#endif /* _EGL_PLATFORM_POSIX */
+#endif /* _EGL_OS_UNIX */
 
       if (p) {
          ret = _eglsnprintf(buffer, sizeof(buffer),
@@ -441,7 +441,7 @@
    char *env;
 
    env = getenv("EGL_DRIVER");
-#if defined(_EGL_PLATFORM_POSIX)
+#if defined(_EGL_OS_UNIX)
    if (env && strchr(env, '/')) {
       search_path = "";
       if ((geteuid() != getuid() || getegid() != getgid())) {
@@ -450,7 +450,7 @@
          env = NULL;
       }
    }
-#endif /* _EGL_PLATFORM_POSIX */
+#endif /* _EGL_OS_UNIX */
    if (!env)
       return EGL_FALSE;
 
@@ -464,24 +464,27 @@
 
 
 /**
- * Preload display drivers.
+ * Preload platform drivers.
  *
- * Display drivers are a set of drivers that support a certain display system.
- * The display system may be specified by EGL_DISPLAY.
+ * Platform drivers are a set of drivers that support a certain window system.
+ * The window system may be specified by EGL_PLATFORM.
  *
  * FIXME This makes libEGL a memory hog if an user driver is not specified and
- * there are many display drivers.
+ * there are many platform drivers.
  */
 static EGLBoolean
-_eglPreloadDisplayDrivers(void)
+_eglPreloadPlatformDrivers(void)
 {
    const char *dpy;
    char prefix[32];
    int ret;
 
-   dpy = getenv("EGL_DISPLAY");
+   dpy = getenv("EGL_PLATFORM");
+   /* try deprecated env variable */
    if (!dpy || !dpy[0])
-      dpy = _EGL_DEFAULT_DISPLAY;
+      dpy = getenv("EGL_DISPLAY");
+   if (!dpy || !dpy[0])
+      dpy = _EGL_DEFAULT_PLATFORM;
    if (!dpy || !dpy[0])
       return EGL_FALSE;
 
@@ -515,7 +518,7 @@
    }
 
    loaded = (_eglPreloadUserDriver() ||
-             _eglPreloadDisplayDrivers());
+             _eglPreloadPlatformDrivers());
 
    _eglUnlockMutex(_eglGlobal.Mutex);
 
@@ -578,6 +581,16 @@
 
 
 /**
+ * Return the build-time native platform, i.e. that of the EGL native types.
+ */
+_EGLPlatformType
+_eglGetNativePlatform(void)
+{
+   return _EGL_NATIVE_PLATFORM; /* defined with -D by the build system */
+}
+
+
+/**
  * Plug all the available fallback routines into the given driver's
  * dispatch table.
  */
diff --git a/src/egl/main/egldriver.h b/src/egl/main/egldriver.h
index 8b34c43..6a52374 100644
--- a/src/egl/main/egldriver.h
+++ b/src/egl/main/egldriver.h
@@ -3,6 +3,7 @@
 
 
 #include "egltypedefs.h"
+#include "egldisplay.h"
 #include "eglapi.h"
 
 
@@ -88,6 +89,10 @@
 _eglLoadDefaultDriver(EGLDisplay dpy, EGLint *major, EGLint *minor);
 
 
+extern _EGLPlatformType
+_eglGetNativePlatform(void);
+
+
 PUBLIC void
 _eglInitDriverFallbacks(_EGLDriver *drv);
 
diff --git a/src/egl/main/eglmisc.c b/src/egl/main/eglmisc.c
index 4652969..281138c 100644
--- a/src/egl/main/eglmisc.c
+++ b/src/egl/main/eglmisc.c
@@ -84,6 +84,7 @@
 
    _EGL_CHECK_EXTENSION(MESA_screen_surface);
    _EGL_CHECK_EXTENSION(MESA_copy_context);
+   _EGL_CHECK_EXTENSION(MESA_drm_display);
 
    _EGL_CHECK_EXTENSION(KHR_image_base);
    _EGL_CHECK_EXTENSION(KHR_image_pixmap);
diff --git a/src/egl/main/eglstring.h b/src/egl/main/eglstring.h
index bebb758..f1d559b 100644
--- a/src/egl/main/eglstring.h
+++ b/src/egl/main/eglstring.h
@@ -3,7 +3,7 @@
 
 #include <string.h>
 
-#ifdef _EGL_PLATFORM_WINDOWS
+#ifdef _EGL_OS_WINDOWS
 #define _eglstrcasecmp _stricmp
 #define _eglsnprintf _snprintf
 #else
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 7c77025..dab95e5 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -435,13 +435,27 @@
  */
 void
 draw_texture_samplers(struct draw_context *draw,
+                      uint shader,
                       uint num_samplers,
                       struct tgsi_sampler **samplers)
 {
-   draw->vs.num_samplers = num_samplers;
-   draw->vs.samplers = samplers;
-   draw->gs.num_samplers = num_samplers;
-   draw->gs.samplers = samplers;
+   /* Samplers are tracked per shader stage; only the vertex and geometry
+    * stages are accepted here.
+    */
+   switch (shader) {
+   case PIPE_SHADER_VERTEX:
+      draw->vs.num_samplers = num_samplers;
+      draw->vs.samplers = samplers;
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      draw->gs.num_samplers = num_samplers;
+      draw->gs.samplers = samplers;
+      break;
+   default:
+      /* unexpected stage: leave both sampler sets untouched */
+      debug_assert(0);
+      break;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 103d653..c0122f2 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -97,6 +97,7 @@
 
 void
 draw_texture_samplers(struct draw_context *draw,
+                      uint shader_type,
                       uint num_samplers,
                       struct tgsi_sampler **samplers);
 
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index a1ca707..79a57a6 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -39,7 +39,6 @@
 #include "util/u_memory.h"
 #include "util/u_prim.h"
 
-#define MAX_PRIM_VERTICES 6
 /* fixme: move it from here */
 #define MAX_PRIMITIVES 64
 
@@ -76,6 +75,7 @@
                       const void *constants,
                       unsigned size)
 {
+   /* noop */
 }
 
 
@@ -171,9 +171,10 @@
    /* Unswizzle all output results.
     */
 
-   shader->emitted_primitives += num_primitives;
    for (prim_idx = 0; prim_idx < num_primitives; ++prim_idx) {
       unsigned num_verts_per_prim = machine->Primitives[prim_idx];
+      shader->primitive_lengths[prim_idx +   shader->emitted_primitives] =
+         machine->Primitives[prim_idx];
       shader->emitted_vertices += num_verts_per_prim;
       for (j = 0; j < num_verts_per_prim; j++) {
          int idx = (prim_idx * num_verts_per_prim + j) *
@@ -199,9 +200,10 @@
       }
    }
    *p_output = output;
+         shader->emitted_primitives += num_primitives;
 }
 
-
+/*#define DEBUG_INPUTS 1*/
 static void draw_fetch_gs_input(struct draw_geometry_shader *shader,
                                 unsigned *indices,
                                 unsigned num_vertices,
@@ -216,19 +218,28 @@
 
    for (i = 0; i < num_vertices; ++i) {
       const float (*input)[4];
-      /*debug_printf("%d) vertex index = %d (prim idx = %d)\n", i, indices[i], prim_idx);*/
+#if DEBUG_INPUTS
+      debug_printf("%d) vertex index = %d (prim idx = %d)\n",
+                   i, indices[i], prim_idx);
+#endif
       input = (const float (*)[4])(
          (const char *)input_ptr + (indices[i] * input_vertex_stride));
       for (slot = 0, vs_slot = 0; slot < shader->info.num_inputs; ++slot) {
          unsigned idx = i * TGSI_EXEC_MAX_INPUT_ATTRIBS + slot;
          if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_PRIMID) {
-            machine->Inputs[idx].xyzw[0].f[prim_idx] = (float)shader->in_prim_idx;
-            machine->Inputs[idx].xyzw[1].f[prim_idx] = (float)shader->in_prim_idx;
-            machine->Inputs[idx].xyzw[2].f[prim_idx] = (float)shader->in_prim_idx;
-            machine->Inputs[idx].xyzw[3].f[prim_idx] = (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[0].f[prim_idx] =
+               (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[1].f[prim_idx] =
+               (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[2].f[prim_idx] =
+               (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[3].f[prim_idx] =
+               (float)shader->in_prim_idx;
          } else {
-            /*debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n",
-              slot, vs_slot, idx);*/
+#if DEBUG_INPUTS
+            debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n",
+                         slot, vs_slot, idx);
+#endif
 #if 1
             assert(!util_is_inf_or_nan(input[vs_slot][0]));
             assert(!util_is_inf_or_nan(input[vs_slot][1]));
@@ -239,7 +250,7 @@
             machine->Inputs[idx].xyzw[1].f[prim_idx] = input[vs_slot][1];
             machine->Inputs[idx].xyzw[2].f[prim_idx] = input[vs_slot][2];
             machine->Inputs[idx].xyzw[3].f[prim_idx] = input[vs_slot][3];
-#if 0
+#if DEBUG_INPUTS
             debug_printf("\t\t%f %f %f %f\n",
                          machine->Inputs[idx].xyzw[0].f[prim_idx],
                          machine->Inputs[idx].xyzw[1].f[prim_idx],
@@ -252,7 +263,6 @@
    }
 }
 
-
 static void gs_flush(struct draw_geometry_shader *shader,
                      unsigned input_primitives)
 {
@@ -274,6 +284,11 @@
    out_prim_count =
       machine->Temps[TGSI_EXEC_TEMP_PRIMITIVE_I].xyzw[TGSI_EXEC_TEMP_PRIMITIVE_C].u[0];
 
+#if 0
+   debug_printf("PRIM emitted prims = %d (verts=%d), cur prim count = %d\n",
+                shader->emitted_primitives, shader->emitted_vertices,
+                out_prim_count);
+#endif
    draw_geometry_fetch_outputs(shader, out_prim_count,
                                &shader->tmp_output);
 }
@@ -305,6 +320,22 @@
    gs_flush(shader, 1);
 }
 
+/* Hand one line-with-adjacency primitive (four vertices) to the GS. */
+static void gs_line_adj(struct draw_geometry_shader *shader,
+                        int i0, int i1, int i2, int i3)
+{
+   unsigned verts[4];
+
+   verts[0] = i0;
+   verts[1] = i1;
+   verts[2] = i2;
+   verts[3] = i3;
+
+   draw_fetch_gs_input(shader, verts, 4, 0);
+   shader->in_prim_idx += 1;
+   gs_flush(shader, 1);
+}
+
 static void gs_tri(struct draw_geometry_shader *shader,
                    int i0, int i1, int i2)
 {
@@ -320,58 +351,150 @@
    gs_flush(shader, 1);
 }
 
+static void gs_tri_adj(struct draw_geometry_shader *shader,
+                       int i0, int i1, int i2,
+                       int i3, int i4, int i5)
+{
+   unsigned indices[6];
+
+   indices[0] = i0;
+   indices[1] = i1;
+   indices[2] = i2;
+   indices[3] = i3;
+   indices[4] = i4;
+   indices[5] = i5;
+
+   draw_fetch_gs_input(shader, indices, 6, 0);
+   ++shader->in_prim_idx;
+
+   gs_flush(shader, 1);
+}
+
 #define TRIANGLE(gs,i0,i1,i2) gs_tri(gs,i0,i1,i2)
-#define LINE(gs,i0,i1)  gs_line(gs,i0,i1)
+#define TRI_ADJ(gs,i0,i1,i2,i3,i4,i5)  gs_tri_adj(gs,i0,i1,i2,i3,i4,i5)
+#define LINE(gs,i0,i1)        gs_line(gs,i0,i1)
+#define LINE_ADJ(gs,i0,i1,i2,i3)    gs_line_adj(gs,i0,i1,i2,i3)
 #define POINT(gs,i0)          gs_point(gs,i0)
 #define FUNC gs_run
+#define LOCAL_VARS
+#include "draw_gs_tmp.h"
+
+
+#define TRIANGLE(gs,i0,i1,i2) gs_tri(gs,elts[i0],elts[i1],elts[i2])
+#define TRI_ADJ(gs,i0,i1,i2,i3,i4,i5)           \
+   gs_tri_adj(gs,elts[i0],elts[i1],elts[i2],elts[i3], \
+              elts[i4],elts[i5])
+#define LINE(gs,i0,i1)        gs_line(gs,elts[i0],elts[i1])
+#define LINE_ADJ(gs,i0,i1,i2,i3)  gs_line_adj(gs,elts[i0],      \
+                                              elts[i1],         \
+                                              elts[i2],elts[i3])
+#define POINT(gs,i0)          gs_point(gs,elts[i0])
+#define FUNC gs_run_elts
+#define LOCAL_VARS                         \
+   const ushort *elts = input_prims->elts;
 #include "draw_gs_tmp.h"
 
+/**
+ * Run the geometry shader over the primitives described by input_prim.
+ *
+ * The output vertex buffer is allocated here and returned through
+ * output_verts->verts.  output_prims is filled in to describe the emitted
+ * vertices as a linear (non-indexed) list of shader->output_primitive
+ * primitives.
+ *
+ * If either output buffer cannot be allocated the shader is not run and an
+ * empty result (zero vertices, zero primitives) is reported.
+ *
+ * \return the number of vertices emitted by the shader.
+ */
 int draw_geometry_shader_run(struct draw_geometry_shader *shader,
-                             unsigned pipe_prim,
-                             const float (*input)[4],
-                             float (*output)[4],
-                             const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
-                             unsigned count,
-                             unsigned input_stride,
-                             unsigned vertex_size)
+                             const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                             const struct draw_vertex_info *input_verts,
+                             const struct draw_prim_info *input_prim,
+                             struct draw_vertex_info *output_verts,
+                             struct draw_prim_info *output_prims )
 {
+   const float (*input)[4] = (const float (*)[4])input_verts->verts->data;
+   unsigned input_stride = input_verts->vertex_size;
+   unsigned vertex_size = input_verts->vertex_size;
    struct tgsi_exec_machine *machine = shader->machine;
    unsigned int i;
+   unsigned num_input_verts = input_prim->linear ?
+                              input_verts->count :
+                              input_prim->count;
    unsigned num_in_primitives =
-      u_gs_prims_for_vertices(pipe_prim, count);
-   unsigned alloc_count = draw_max_output_vertices(shader->draw,
-                                                   pipe_prim,
-                                                   count);
-   /* this is bad, but we can't be overwriting the output array
-    * because it's the same as input array here */
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(vertex_size * alloc_count);
+      MAX2(u_gs_prims_for_vertices(input_prim->prim, num_input_verts),
+           u_gs_prims_for_vertices(shader->input_primitive, num_input_verts));
+   unsigned max_out_prims = u_gs_prims_for_vertices(shader->output_primitive,
+                                                    shader->max_output_vertices)
+                            * num_in_primitives;
 
-   if (!pipeline_verts)
-      return 0;
+   output_verts->vertex_size = input_verts->vertex_size;
+   output_verts->stride = input_verts->vertex_size;
+   output_verts->verts =
+      (struct vertex_header *)MALLOC(input_verts->vertex_size *
+                                     num_in_primitives *
+                                     shader->max_output_vertices);
 
-   if (0) debug_printf("%s count = %d (prims = %d)\n", __FUNCTION__,
-                       count, num_in_primitives);
+
+#if 0
+   debug_printf("%s count = %d (in prims # = %d)\n",
+                __FUNCTION__, num_input_verts, num_in_primitives);
+   debug_printf("\tlinear = %d, prim_info->count = %d\n",
+                input_prim->linear, input_prim->count);
+   debug_printf("\tprimt pipe = %d, shader in = %d, shader out = %d, max out = %d\n",
+                input_prim->prim, shader->input_primitive,
+                shader->output_primitive,
+                shader->max_output_vertices);
+#endif
 
    shader->emitted_vertices = 0;
    shader->emitted_primitives = 0;
    shader->vertex_size = vertex_size;
-   shader->tmp_output = (      float (*)[4])pipeline_verts->data;
+   shader->tmp_output = output_verts->verts ?
+      (float (*)[4])output_verts->verts->data : NULL;
    shader->in_prim_idx = 0;
    shader->input_vertex_stride = input_stride;
    shader->input = input;
+   if (shader->primitive_lengths) {
+      FREE(shader->primitive_lengths);
+   }
+   shader->primitive_lengths = MALLOC(max_out_prims * sizeof(unsigned));
 
    for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
       machine->Consts[i] = constants[i];
    }
 
-   gs_run(shader, pipe_prim, count);
+   /* Run the shader only when both output buffers were allocated; on
+    * allocation failure fall through and report an empty result rather
+    * than writing through a NULL pointer.
+    */
+   if (output_verts->verts && shader->primitive_lengths) {
+      if (input_prim->linear)
+         gs_run(shader, input_prim, input_verts,
+                output_prims, output_verts);
+      else
+         gs_run_elts(shader, input_prim, input_verts,
+                     output_prims, output_verts);
+   }
 
-   memcpy(output, pipeline_verts->data,
-          shader->info.num_outputs * 4 * sizeof(float) +
-          vertex_size * (shader->emitted_vertices -1));
+   /* Update prim_info:
+    */
+   output_prims->linear = TRUE;
+   output_prims->elts = NULL;
+   output_prims->start = 0;
+   output_prims->count = shader->emitted_vertices;
+   output_prims->prim = shader->output_primitive;
+   output_prims->primitive_lengths = shader->primitive_lengths;
+   output_prims->primitive_count = shader->emitted_primitives;
+   output_verts->count = shader->emitted_vertices;
 
-   FREE(pipeline_verts);
+#if 0
+   debug_printf("GS finished, prims = %d, verts = %d\n",
+                output_prims->primitive_count,
+                output_verts->count);
+#endif
+
    return shader->emitted_vertices;
 }
 
@@ -391,24 +514,3 @@
                                     draw->gs.samplers);
    }
 }
-
-int draw_max_output_vertices(struct draw_context *draw,
-                             unsigned pipe_prim,
-                             unsigned count)
-{
-   unsigned alloc_count = align( count, 4 );
-
-   if (draw->gs.geometry_shader) {
-      unsigned input_primitives = u_gs_prims_for_vertices(pipe_prim,
-                                                          count);
-      /* max GS output is number of input primitives * max output
-       * vertices per each invocation */
-      unsigned gs_max_verts = input_primitives *
-                              draw->gs.geometry_shader->max_output_vertices;
-      if (gs_max_verts > count)
-         alloc_count = align(gs_max_verts, 4);
-   }
-   /*debug_printf("------- alloc count = %d (input = %d)\n",
-                  alloc_count, count);*/
-   return alloc_count;
-}
diff --git a/src/gallium/auxiliary/draw/draw_gs.h b/src/gallium/auxiliary/draw/draw_gs.h
index 65f0c61..2cb6348 100644
--- a/src/gallium/auxiliary/draw/draw_gs.h
+++ b/src/gallium/auxiliary/draw/draw_gs.h
@@ -54,6 +54,7 @@
    unsigned input_primitive;
    unsigned output_primitive;
 
+   unsigned *primitive_lengths;
    unsigned emitted_vertices;
    unsigned emitted_primitives;
 
@@ -71,13 +72,11 @@
  * smaller than the GS_MAX_OUTPUT_VERTICES shader property.
  */
 int draw_geometry_shader_run(struct draw_geometry_shader *shader,
-                             unsigned pipe_prim,
-                             const float (*input)[4],
-                             float (*output)[4],
-                             const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
-                             unsigned count,
-                             unsigned input_stride,
-                             unsigned output_stride);
+                             const void *constants[PIPE_MAX_CONSTANT_BUFFERS], 
+                             const struct draw_vertex_info *input_verts,
+                             const struct draw_prim_info *input_prim,
+                             struct draw_vertex_info *output_verts,
+                             struct draw_prim_info *output_prims );
 
 void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
                                   struct draw_context *draw);
diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h
index eb4a313..7a8683c 100644
--- a/src/gallium/auxiliary/draw/draw_gs_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h
@@ -1,18 +1,23 @@
 
 static void FUNC( struct draw_geometry_shader *shader,
-                  unsigned pipe_prim,
-                  unsigned count )
+                  const struct draw_prim_info *input_prims,
+                  const struct draw_vertex_info *input_verts,
+                  struct draw_prim_info *output_prims,
+                  struct draw_vertex_info *output_verts)
 {
    struct draw_context *draw = shader->draw;
 
    boolean flatfirst = (draw->rasterizer->flatshade &&
                         draw->rasterizer->flatshade_first);
-   unsigned i;
+   unsigned i, j;
+   unsigned count = input_prims->count;
+   LOCAL_VARS
 
    if (0) debug_printf("%s %d\n", __FUNCTION__, count);
 
+   debug_assert(input_prims->primitive_count == 1);
 
-   switch (pipe_prim) {
+   switch (input_prims->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < count; i++) {
 	 POINT( shader, i + 0 );
@@ -90,20 +95,6 @@
 
    case PIPE_PRIM_POLYGON:
       {
-         /* These bitflags look a little odd because we submit the
-          * vertices as (1,2,0) to satisfy flatshade requirements.
-          */
-         ushort edge_next, edge_finish;
-
-         if (flatfirst) {
-            edge_next = DRAW_PIPE_EDGE_FLAG_2;
-            edge_finish = DRAW_PIPE_EDGE_FLAG_0;
-         }
-         else {
-            edge_next = DRAW_PIPE_EDGE_FLAG_0;
-            edge_finish = DRAW_PIPE_EDGE_FLAG_1;
-         }
-
 	 for (i = 0; i+2 < count; i++) {
 
             if (flatfirst) {
@@ -116,14 +107,46 @@
       }
       break;
 
+   case PIPE_PRIM_LINES_ADJACENCY:
+      for (i = 0; i+3 < count; i += 4) {
+         LINE_ADJ( shader , i + 0 , i + 1, i + 2, i + 3 );
+      }
+      break;
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      for (i = 1; i + 2 < count; i++) {
+         LINE_ADJ( shader, i - 1, i, i + 1, i + 2 );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      for (i = 0; i+5 < count; i += 6) { /* each primitive consumes 6 verts */
+         TRI_ADJ( shader, i + 0, i + 1, i + 2,
+                  i + 3, i + 4, i + 5);
+      }
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      for (i = 0, j = 0; i+5 < count; i += 2, ++j) {
+         TRI_ADJ( shader,
+                  i + 0,
+                  i + 1 + 2*(j&1),
+                  i + 2 + 2*(j&1),
+                  i + 3 - 2*(j&1),
+                  i + 4 - 2*(j&1),
+                  i + 5);
+      }
+      break;
+
    default:
-      assert(0);
+      debug_assert(!"Unsupported primitive in geometry shader");
       break;
    }
 }
 
 
 #undef TRIANGLE
+#undef TRI_ADJ
 #undef POINT
 #undef LINE
+#undef LINE_ADJ
 #undef FUNC
+#undef LOCAL_VARS
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index bd5d885..9117c13 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -173,6 +173,8 @@
 #endif
 
    llvm = CALLOC_STRUCT( draw_llvm );
+   if (!llvm)
+      return NULL;
 
    llvm->draw = draw;
    llvm->engine = draw->engine;
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 7ea04e3..a8b9dc6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -177,15 +177,15 @@
                 ( DRAW_PIPE_RESET_STIPPLE |     \
                   DRAW_PIPE_EDGE_FLAG_0 |       \
                   DRAW_PIPE_EDGE_FLAG_1 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i2]);     \
+                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK));    \
    do_triangle( draw,                           \
                 ( DRAW_PIPE_EDGE_FLAG_1 |       \
                   DRAW_PIPE_EDGE_FLAG_2 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i2],      \
-                verts + stride * elts[i3])
+                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i3] & ~DRAW_PIPE_FLAG_MASK))
 
 /* emit last quad vertex as last vertex in triangles */
 #define QUAD_LAST_PV(i0,i1,i2,i3)               \
@@ -193,15 +193,15 @@
                 ( DRAW_PIPE_RESET_STIPPLE |     \
                   DRAW_PIPE_EDGE_FLAG_0 |       \
                   DRAW_PIPE_EDGE_FLAG_2 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i3]);     \
+                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i3] & ~DRAW_PIPE_FLAG_MASK));    \
    do_triangle( draw,                           \
                 ( DRAW_PIPE_EDGE_FLAG_0 |       \
                   DRAW_PIPE_EDGE_FLAG_1 ),      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i2],      \
-                verts + stride * elts[i3])
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i3] & ~DRAW_PIPE_FLAG_MASK))
 
 #define TRIANGLE(flags,i0,i1,i2)                                        \
    do_triangle( draw,                                                   \
@@ -218,7 +218,7 @@
 
 #define POINT(i0)                               \
    do_point( draw,                              \
-             verts + stride * elts[i0] )
+             verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK) )
 
 #define FUNC pipe_run
 #define ARGS                                    \
@@ -256,27 +256,34 @@
  * draw_vbuf.c code uses when it has to perform a flush.
  */
 void draw_pipeline_run( struct draw_context *draw,
-                        unsigned prim,
-                        struct vertex_header *vertices,
-                        unsigned vertex_count,
-                        unsigned stride,
-                        const ushort *elts,
-                        unsigned count )
+                        const struct draw_vertex_info *vert_info,
+                        const struct draw_prim_info *prim_info)
 {
-   char *verts = (char *)vertices;
+   unsigned i, start;
+   
+   draw->pipeline.verts = (char *)vert_info->verts;
+   draw->pipeline.vertex_stride = vert_info->stride;
+   draw->pipeline.vertex_count = vert_info->count;
 
-   draw->pipeline.verts = verts;
-   draw->pipeline.vertex_stride = stride;
-   draw->pipeline.vertex_count = vertex_count;
-   
-   pipe_run(draw, prim, vertices, stride, elts, count);
-   
+   /* Each primitive_lengths entry covers the next run of elts. */
+   for (start = 0, i = 0; i < prim_info->primitive_count; i++) {
+      const unsigned run_len = prim_info->primitive_lengths[i];
+      const ushort *run_elts = prim_info->elts + start;
+
+      pipe_run(draw,
+               prim_info->prim,
+               vert_info->verts,
+               vert_info->stride,
+               run_elts,
+               run_len);
+      start += run_len;
+   }
+
    draw->pipeline.verts = NULL;
    draw->pipeline.vertex_count = 0;
 }
 
 
-
 /*
  * Set up macros for draw_pt_decompose.h template code.
  * This code is for non-indexed (aka linear) rendering (no elts).
@@ -289,14 +296,14 @@
                   DRAW_PIPE_EDGE_FLAG_0 |                        \
                   DRAW_PIPE_EDGE_FLAG_1 ),                       \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i2));                          \
+                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK)); \
    do_triangle( draw,                                            \
                 ( DRAW_PIPE_EDGE_FLAG_1 |                        \
                   DRAW_PIPE_EDGE_FLAG_2 ),                       \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i2),                           \
-                verts + stride * (i3))
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i3) & ~DRAW_PIPE_FLAG_MASK))
 
 /* emit last quad vertex as last vertex in triangles */
 #define QUAD_LAST_PV(i0,i1,i2,i3)                                \
@@ -305,31 +312,31 @@
                   DRAW_PIPE_EDGE_FLAG_0 |                        \
                   DRAW_PIPE_EDGE_FLAG_2 ),                       \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i3));                          \
+                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i3) & ~DRAW_PIPE_FLAG_MASK)); \
    do_triangle( draw,                                            \
                 ( DRAW_PIPE_EDGE_FLAG_0 |                        \
                   DRAW_PIPE_EDGE_FLAG_1 ),                       \
                 verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i2),                           \
-                verts + stride * (i3))
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i3) & ~DRAW_PIPE_FLAG_MASK))
 
 #define TRIANGLE(flags,i0,i1,i2)                                 \
    do_triangle( draw,                                            \
                 flags,  /* flags */                              \
                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i2))
+                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
+                verts + stride * ((i2) & ~DRAW_PIPE_FLAG_MASK))
 
 #define LINE(flags,i0,i1)                                   \
    do_line( draw,                                           \
             flags,                                          \
             verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \
-            verts + stride * (i1))
+            verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK))
 
 #define POINT(i0)                               \
    do_point( draw,                              \
-             verts + stride * i0 )
+             verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK) )
 
 #define FUNC pipe_run_linear
 #define ARGS                                    \
@@ -354,17 +361,29 @@
  * For drawing non-indexed primitives.
  */
 void draw_pipeline_run_linear( struct draw_context *draw,
-                               unsigned prim,
-                               struct vertex_header *vertices,
-                               unsigned count,
-                               unsigned stride )
+                               const struct draw_vertex_info *vert_info,
+                               const struct draw_prim_info *prim_info)
 {
-   char *verts = (char *)vertices;
-   draw->pipeline.verts = verts;
-   draw->pipeline.vertex_stride = stride;
-   draw->pipeline.vertex_count = count;
+   unsigned i, start;
 
-   pipe_run_linear(draw, prim, vertices, stride, count);
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      unsigned count = prim_info->primitive_lengths[i];
+      char *verts = ((char*)vert_info->verts) +
+                    (start * vert_info->stride);
+
+      draw->pipeline.verts = verts;
+      draw->pipeline.vertex_stride = vert_info->stride;
+      draw->pipeline.vertex_count = count;
+
+      pipe_run_linear(draw,
+                      prim_info->prim,
+                      (struct vertex_header*)verts,
+                      vert_info->stride,
+                      count);
+   }
 
    draw->pipeline.verts = NULL;
    draw->pipeline.vertex_count = 0;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 4faf0a7..debd17f 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -788,9 +788,6 @@
    if (aaline == NULL)
       return NULL;
 
-   if (!draw_alloc_temp_verts( &aaline->stage, 8 ))
-      goto fail;
-
    aaline->stage.draw = draw;
    aaline->stage.name = "aaline";
    aaline->stage.next = NULL;
@@ -801,11 +798,14 @@
    aaline->stage.reset_stipple_counter = aaline_reset_stipple_counter;
    aaline->stage.destroy = aaline_destroy;
 
+   if (!draw_alloc_temp_verts( &aaline->stage, 8 ))
+      goto fail;
+
    return aaline;
 
  fail:
    if (aaline)
-      aaline_destroy(&aaline->stage);
+      aaline->stage.destroy(&aaline->stage);
 
    return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index bba6f50..d406a86 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -780,9 +780,6 @@
    if (aapoint == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &aapoint->stage, 4 ))
-      goto fail;
-
    aapoint->stage.draw = draw;
    aapoint->stage.name = "aapoint";
    aapoint->stage.next = NULL;
@@ -793,11 +790,14 @@
    aapoint->stage.reset_stipple_counter = aapoint_reset_stipple_counter;
    aapoint->stage.destroy = aapoint_destroy;
 
+   if (!draw_alloc_temp_verts( &aapoint->stage, 4 ))
+      goto fail;
+
    return aapoint;
 
  fail:
    if (aapoint)
-      aapoint_destroy(&aapoint->stage);
+      aapoint->stage.destroy(&aapoint->stage);
 
    return NULL;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index df8d82e..122b1c7 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -522,9 +522,6 @@
    if (clipper == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 ))
-      goto fail;
-
    clipper->stage.draw = draw;
    clipper->stage.name = "clipper";
    clipper->stage.point = clip_point;
@@ -536,6 +533,9 @@
 
    clipper->plane = draw->plane;
 
+   if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 ))
+      goto fail;
+
    return &clipper->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index bf84ce3..2f4d01d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -129,9 +129,6 @@
    if (cull == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &cull->stage, 0 ))
-      goto fail;
-
    cull->stage.draw = draw;
    cull->stage.name = "cull";
    cull->stage.next = NULL;
@@ -142,6 +139,9 @@
    cull->stage.reset_stipple_counter = cull_reset_stipple_counter;
    cull->stage.destroy = cull_destroy;
 
+   if (!draw_alloc_temp_verts( &cull->stage, 0 ))
+      goto fail;
+
    return &cull->stage;
 
 fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 34afb1a..693f289 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -257,9 +257,6 @@
    if (flatshade == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &flatshade->stage, 2 ))
-      goto fail;
-
    flatshade->stage.draw = draw;
    flatshade->stage.name = "flatshade";
    flatshade->stage.next = NULL;
@@ -270,6 +267,9 @@
    flatshade->stage.reset_stipple_counter = flatshade_reset_stipple_counter;
    flatshade->stage.destroy = flatshade_destroy;
 
+   if (!draw_alloc_temp_verts( &flatshade->stage, 2 ))
+      goto fail;
+
    return &flatshade->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index 8e32194..8afbbfa 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -161,9 +161,7 @@
 {
    struct offset_stage *offset = CALLOC_STRUCT(offset_stage);
    if (offset == NULL)
-      return NULL;
-
-   draw_alloc_temp_verts( &offset->stage, 3 );
+      goto fail;
 
    offset->stage.draw = draw;
    offset->stage.name = "offset";
@@ -175,5 +173,14 @@
    offset->stage.reset_stipple_counter = offset_reset_stipple_counter;
    offset->stage.destroy = offset_destroy;
 
+   if (!draw_alloc_temp_verts( &offset->stage, 3 ))
+      goto fail;
+
    return &offset->stage;
+
+fail:
+   if (offset)
+      offset->stage.destroy( &offset->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index ef30db0..fff960c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -607,8 +607,8 @@
 draw_pstip_stage(struct draw_context *draw)
 {
    struct pstip_stage *pstip = CALLOC_STRUCT(pstip_stage);
-
-   draw_alloc_temp_verts( &pstip->stage, 8 );
+   if (pstip == NULL)
+      goto fail;
 
    pstip->stage.draw = draw;
    pstip->stage.name = "pstip";
@@ -620,7 +620,16 @@
    pstip->stage.reset_stipple_counter = pstip_reset_stipple_counter;
    pstip->stage.destroy = pstip_destroy;
 
+   if (!draw_alloc_temp_verts( &pstip->stage, 8 ))
+      goto fail;
+
    return pstip;
+
+fail:
+   if (pstip)
+      pstip->stage.destroy( &pstip->stage );
+
+   return NULL;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 70fbab9..4b3f4e7 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -235,8 +235,8 @@
 struct draw_stage *draw_stipple_stage( struct draw_context *draw )
 {
    struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage);
-
-   draw_alloc_temp_verts( &stipple->stage, 2 );
+   if (stipple == NULL)
+      goto fail;
 
    stipple->stage.draw = draw;
    stipple->stage.name = "stipple";
@@ -248,5 +248,14 @@
    stipple->stage.flush = stipple_flush;
    stipple->stage.destroy = stipple_destroy;
 
+   if (!draw_alloc_temp_verts( &stipple->stage, 2 ))
+      goto fail;
+
    return &stipple->stage;
+
+fail:
+   if (stipple)
+      stipple->stage.destroy( &stipple->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 808b2fb..9a3f3fe 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -177,9 +177,6 @@
    if (twoside == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &twoside->stage, 3 ))
-      goto fail;
-
    twoside->stage.draw = draw;
    twoside->stage.name = "twoside";
    twoside->stage.next = NULL;
@@ -190,6 +187,9 @@
    twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter;
    twoside->stage.destroy = twoside_destroy;
 
+   if (!draw_alloc_temp_verts( &twoside->stage, 3 ))
+      goto fail;
+
    return &twoside->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index e333d26..d87741b 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -202,9 +202,6 @@
    if (unfilled == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &unfilled->stage, 0 ))
-      goto fail;
-
    unfilled->stage.draw = draw;
    unfilled->stage.name = "unfilled";
    unfilled->stage.next = NULL;
@@ -216,6 +213,9 @@
    unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter;
    unfilled->stage.destroy = unfilled_destroy;
 
+   if (!draw_alloc_temp_verts( &unfilled->stage, 0 ))
+      goto fail;
+
    return &unfilled->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index d7ac95b..98da9cf 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -202,8 +202,8 @@
 struct draw_stage *draw_wide_line_stage( struct draw_context *draw )
 {
    struct wideline_stage *wide = CALLOC_STRUCT(wideline_stage);
-
-   draw_alloc_temp_verts( &wide->stage, 4 );
+   if (wide == NULL)
+      goto fail;
 
    wide->stage.draw = draw;
    wide->stage.name = "wide-line";
@@ -215,5 +215,14 @@
    wide->stage.reset_stipple_counter = wideline_reset_stipple_counter;
    wide->stage.destroy = wideline_destroy;
 
+   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
+      goto fail;
+
    return &wide->stage;
+
+fail:
+   if (wide)
+      wide->stage.destroy( &wide->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index a86fe19..3e6e538 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -324,9 +324,6 @@
    if (wide == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
-      goto fail;
-
    wide->stage.draw = draw;
    wide->stage.name = "wide-point";
    wide->stage.next = NULL;
@@ -337,6 +334,9 @@
    wide->stage.reset_stipple_counter = widepoint_reset_stipple_counter;
    wide->stage.destroy = widepoint_destroy;
 
+   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
+      goto fail;
+
    return &wide->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index fe867ff..4584033 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -265,6 +265,34 @@
    void *driver_private;
 };
 
+/* Describes which vertices a middle end should fetch. */
+struct draw_fetch_info {
+   boolean linear;         /* TRUE: start/count are used; FALSE: elts/count */
+   unsigned start;         /* start value handed to the linear fetch */
+   const unsigned *elts;   /* index list for a non-linear fetch; callers set NULL when linear */
+   unsigned count;         /* number of vertices (linear) or of elts entries */
+};
+
+struct draw_vertex_info {         /* a block of vertices plus its layout */
+   struct vertex_header *verts;   /* vertex storage */
+   unsigned vertex_size;          /* per-vertex size used when sizing 'verts' */
+   unsigned stride;               /* stride handed to the emit code; set equal to vertex_size by the middle end */
+   unsigned count;                /* number of vertices held in 'verts' */
+};
+
+/* Describes the primitives to be drawn from a draw_vertex_info. */
+struct draw_prim_info {
+   boolean linear;                /* TRUE: no index list is used; callers set elts to NULL */
+   unsigned start;                /* callers in the fetch/shade/pipeline middle end set this to 0 */
+   const ushort *elts;            /* index list used when !linear */
+   unsigned count;                /* draw count (number of elts when indexed) */
+
+   unsigned prim;                 /* primitive type (the middle end stores its input_prim here) */
+   unsigned *primitive_lengths;   /* one length per entry; walked in order by the emit code */
+   unsigned primitive_count;      /* number of entries in primitive_lengths */
+};
+
+
 /*******************************************************************************
  * Draw common initialization code
  */
@@ -342,18 +370,13 @@
 #define DRAW_PIPE_FLAG_MASK     (0xf<<12)
 
 void draw_pipeline_run( struct draw_context *draw,
-                        unsigned prim,
-                        struct vertex_header *vertices,
-                        unsigned vertex_count,
-                        unsigned stride,
-                        const ushort *elts,
-                        unsigned count );
+                        const struct draw_vertex_info *vert,
+                        const struct draw_prim_info *prim);
 
 void draw_pipeline_run_linear( struct draw_context *draw,
-                               unsigned prim,
-                               struct vertex_header *vertices,
-                               unsigned count,
-                               unsigned stride );
+                               const struct draw_vertex_info *vert,
+                               const struct draw_prim_info *prim);
+
 
 
 
@@ -380,9 +403,4 @@
                              boolean flatshade );
 
 
-int draw_max_output_vertices(struct draw_context *draw,
-                             unsigned pipe_prim,
-                             unsigned count);
-
-
 #endif /* DRAW_PRIVATE_H */
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 02c97fe..6234272 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -69,7 +69,6 @@
    struct draw_pt_front_end *frontend = NULL;
    struct draw_pt_middle_end *middle = NULL;
    unsigned opt = 0;
-   unsigned out_prim = prim;
 
    /* Sanitize primitive length:
     */
@@ -80,18 +79,19 @@
       if (count < first)
          return TRUE;
    }
-   if (draw->gs.geometry_shader) {
-      out_prim = draw->gs.geometry_shader->output_primitive;
-   }
 
    if (!draw->force_passthrough) {
+      unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                              draw->gs.geometry_shader->output_primitive :
+                              prim);
+
       if (!draw->render) {
          opt |= PT_PIPELINE;
       }
 
       if (draw_need_pipeline(draw,
                              draw->rasterizer,
-                             out_prim)) {
+                             gs_out_prim)) {
          opt |= PT_PIPELINE;
       }
 
@@ -122,7 +122,7 @@
       frontend = draw->pt.front.varray;
    }
 
-   frontend->prepare( frontend, prim, out_prim, middle, opt );
+   frontend->prepare( frontend, prim, middle, opt );
 
    frontend->run(frontend,
                  draw_pt_elt_func(draw),
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 67ae70f..44356fb 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -39,6 +39,8 @@
 
 struct draw_pt_middle_end;
 struct draw_context;
+struct draw_prim_info;
+struct draw_vertex_info;
 
 
 #define PT_SHADE      0x1
@@ -60,8 +62,7 @@
  */
 struct draw_pt_front_end {
    void (*prepare)( struct draw_pt_front_end *,
-                    unsigned input_prim,
-                    unsigned output_prim,
+                    unsigned prim,
                     struct draw_pt_middle_end *,
 		    unsigned opt );
 
@@ -85,8 +86,7 @@
  */
 struct draw_pt_middle_end {
    void (*prepare)( struct draw_pt_middle_end *,
-                    unsigned input_prim,
-                    unsigned output_prim,
+                    unsigned prim,
 		    unsigned opt,
                     unsigned *max_vertices );
 
@@ -164,16 +164,12 @@
                            unsigned *max_vertices );
 
 void draw_pt_emit( struct pt_emit *emit,
-		   const float (*vertex_data)[4],
-		   unsigned vertex_count,
-		   unsigned stride,
-		   const ushort *elts,
-		   unsigned count );
+                   const struct draw_vertex_info *vert_info,
+                   const struct draw_prim_info *prim_info);
 
 void draw_pt_emit_linear( struct pt_emit *emit,
-                          const float (*vertex_data)[4],
-                          unsigned stride,
-                          unsigned count );
+                          const struct draw_vertex_info *vert_info,
+                          const struct draw_prim_info *prim_info);
 
 void draw_pt_emit_destroy( struct pt_emit *emit );
 
@@ -184,13 +180,11 @@
  */
 struct pt_so_emit;
 
-void draw_pt_so_emit_prepare( struct pt_so_emit *emit,
-                              unsigned prim );
+void draw_pt_so_emit_prepare( struct pt_so_emit *emit );
 
 void draw_pt_so_emit( struct pt_so_emit *emit,
-                      const float (*vertex_data)[4],
-                      unsigned vertex_count,
-                      unsigned stride );
+                      const struct draw_vertex_info *vert_info,
+                      const struct draw_prim_info *prim_info );
 
 void draw_pt_so_emit_destroy( struct pt_so_emit *emit );
 
@@ -226,9 +220,7 @@
 struct pt_post_vs;
 
 boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
-			     struct vertex_header *pipeline_verts,
-			     unsigned stride,
-			     unsigned count );
+			     struct draw_vertex_info *info );
 
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
 			      boolean bypass_clipping,
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index f623c07..0229bcc 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -127,15 +127,17 @@
 
 
 void draw_pt_emit( struct pt_emit *emit,
-		   const float (*vertex_data)[4],
-		   unsigned vertex_count,
-		   unsigned stride,
-		   const ushort *elts,
-		   unsigned count )
+                         const struct draw_vertex_info *vert_info,
+                         const struct draw_prim_info *prim_info)
 {
+   const float (*vertex_data)[4] = (const float (*)[4])vert_info->verts->data;
+   unsigned vertex_count = vert_info->count;
+   unsigned stride = vert_info->stride;
+   const ushort *elts = prim_info->elts;
    struct draw_context *draw = emit->draw;
    struct translate *translate = emit->translate;
    struct vbuf_render *render = draw->render;
+   unsigned start, i;
    void *hw_verts;
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
@@ -190,23 +192,31 @@
                            0, 
                            vertex_count - 1 );
 
-   render->draw_elements(render,
-                         elts,
-                         count);
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      render->draw_elements(render,
+                            elts + start,
+                            prim_info->primitive_lengths[i]);
+   }
 
    render->release_vertices(render);
 }
 
 
 void draw_pt_emit_linear(struct pt_emit *emit,
-                         const float (*vertex_data)[4],
-                         unsigned stride,
-                         unsigned count)
+                         const struct draw_vertex_info *vert_info,
+                         const struct draw_prim_info *prim_info)
 {
+   const float (*vertex_data)[4] = (const float (*)[4])vert_info->verts->data;
+   unsigned stride = vert_info->stride;
+   unsigned count = vert_info->count;
    struct draw_context *draw = emit->draw;
    struct translate *translate = emit->translate;
    struct vbuf_render *render = draw->render;
    void *hw_verts;
+   unsigned start, i;
 
 #if 0
    debug_printf("Linear emit\n");
@@ -258,7 +268,14 @@
 
    render->unmap_vertices( render, 0, count - 1 );
 
-   render->draw_arrays(render, 0, count);
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      render->draw_arrays(render,
+                          start,
+                          prim_info->primitive_lengths[i]);
+   }
 
    render->release_vertices(render);
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index a134722..bf799db 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -29,7 +29,6 @@
 #include "util/u_math.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
-#include "draw/draw_vbuf.h"
 #include "draw/draw_pt.h"
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index c629d55..5c8af17 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -36,6 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_gs.h"
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
@@ -90,7 +91,6 @@
 
 static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
                                 unsigned prim,
-                                unsigned out_prim,
 				unsigned opt,
                                 unsigned *max_vertices )
 {
@@ -101,9 +101,14 @@
    boolean ok;
    struct translate_key key;
 
+   unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                           draw->gs.geometry_shader->output_primitive :
+                           prim);
+
+
 
    ok = draw->render->set_primitive( draw->render, 
-                                     out_prim );
+                                     gs_out_prim );
    if (!ok) {
       assert(0);
       return;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 5483a25..b827028 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -68,8 +68,7 @@
 
 
 static void fse_prepare( struct draw_pt_middle_end *middle,
-                         unsigned in_prim,
-                         unsigned out_prim,
+                         unsigned prim,
                          unsigned opt,
                          unsigned *max_vertices )
 {
@@ -80,9 +79,12 @@
    unsigned i;
    unsigned nr_vbs = 0;
 
+   /* Can't support geometry shader on this path.
+    */
+   assert(!draw->gs.geometry_shader);
 
    if (!draw->render->set_primitive( draw->render,
-                                     out_prim )) {
+                                     prim )) {
       assert(0);
       return;
    }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 2301e54..24c538b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -48,13 +48,11 @@
    unsigned vertex_data_offset;
    unsigned vertex_size;
    unsigned input_prim;
-   unsigned output_prim;
    unsigned opt;
 };
 
 static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
-                                    unsigned in_prim,
-                                    unsigned out_prim,
+                                    unsigned prim,
 				    unsigned opt,
                                     unsigned *max_vertices )
 {
@@ -64,6 +62,10 @@
    unsigned i;
    unsigned instance_id_index = ~0;
 
+   unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                           draw->gs.geometry_shader->output_primitive :
+                           prim);
+
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
     */
@@ -79,8 +81,7 @@
       }
    }
 
-   fpme->input_prim = in_prim;
-   fpme->output_prim = out_prim;
+   fpme->input_prim = prim;
    fpme->opt = opt;
 
    /* Always leave room for the vertex header whether we need it or
@@ -102,13 +103,13 @@
 			    (boolean)draw->bypass_clipping,
 			    (boolean)draw->identity_viewport,
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
-			    (draw->vs.edgeflag_output ? true : false) );
+			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
 
-   draw_pt_so_emit_prepare( fpme->so_emit, out_prim );
+   draw_pt_so_emit_prepare( fpme->so_emit );
 
    if (!(opt & PT_PIPELINE)) {
       draw_pt_emit_prepare( fpme->emit,
-			    out_prim,
+			    gs_out_prim,
                             max_vertices );
 
       *max_vertices = MAX2( *max_vertices,
@@ -127,6 +128,164 @@
 }
 
 
+/* Fetch vertices into 'output': a linear fetch_info passes start/count to
+ * draw_pt_fetch_run_linear, otherwise elts/count go to draw_pt_fetch_run.
+ */
+static void fetch( struct pt_fetch *fetch,
+                   const struct draw_fetch_info *fetch_info,
+                   char *output)
+{
+   const unsigned nr = fetch_info->count;
+
+   if (!fetch_info->linear) {
+      /* Indexed fetch. */
+      draw_pt_fetch_run( fetch, fetch_info->elts, nr, output );
+      return;
+   }
+
+   draw_pt_fetch_run_linear( fetch, fetch_info->start, nr, output );
+}
+
+
+/* Run the draw pipeline; prim_info->linear selects the linear variant. */
+static void pipeline(struct fetch_pipeline_middle_end *fpme,
+                     const struct draw_vertex_info *vert_info,
+                     const struct draw_prim_info *prim_info)
+{
+   struct draw_context *draw = fpme->draw;
+
+   if (!prim_info->linear) {
+      draw_pipeline_run( draw, vert_info, prim_info );
+      return;
+   }
+   draw_pipeline_run_linear( draw, vert_info, prim_info );
+}
+
+/* Emit without the pipeline; prim_info->linear selects the linear variant. */
+static void emit(struct pt_emit *emit,
+                 const struct draw_vertex_info *vert_info,
+                 const struct draw_prim_info *prim_info)
+{
+   if (!prim_info->linear) {
+      draw_pt_emit(emit, vert_info, prim_info);
+      return;
+   }
+   draw_pt_emit_linear(emit, vert_info, prim_info);
+}
+
+
+/* Run the vertex shader over input_verts, writing into a newly MALLOC'ed
+ * buffer returned in output_verts->verts.  The buffer is sized for the count
+ * rounded up to a multiple of 4, as the fetch buffer in this file already is.
+ * NOTE(review): a failed MALLOC is not checked before the shader writes. */
+static void draw_vertex_shader_run(struct draw_vertex_shader *vshader,
+                                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                                   const struct draw_vertex_info *input_verts,
+                                   struct draw_vertex_info *output_verts )
+{
+   output_verts->vertex_size = input_verts->vertex_size;
+   output_verts->stride = input_verts->vertex_size;
+   output_verts->count = input_verts->count;
+   output_verts->verts = (struct vertex_header *)
+      MALLOC(output_verts->vertex_size * align(output_verts->count, 4));
+   vshader->run_linear(vshader,
+                       (const float (*)[4])input_verts->verts->data,
+                       (      float (*)[4])output_verts->verts->data,
+                       constants, input_verts->count,
+                       input_verts->vertex_size, input_verts->vertex_size);
+}
+
+/* Common body shared by the run entry points of this middle end: fetch the
+ * vertices, run the vertex/geometry shaders when PT_SHADE is set, do stream
+ * output and post-VS processing, then use either the pipeline or emit path.
+ */
+static void fetch_pipeline_generic( struct draw_pt_middle_end *middle,
+                                    const struct draw_fetch_info *fetch_info,
+                                    const struct draw_prim_info *prim_info )
+{
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_vertex_shader *vshader = draw->vs.vertex_shader;
+   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
+   struct draw_prim_info gs_prim_info;
+   struct draw_vertex_info fetched_vert_info;
+   struct draw_vertex_info vs_vert_info;
+   struct draw_vertex_info gs_vert_info;
+   struct draw_vertex_info *vert_info;
+   unsigned opt = fpme->opt;
+
+   fetched_vert_info.count = fetch_info->count;
+   fetched_vert_info.vertex_size = fpme->vertex_size;
+   fetched_vert_info.stride = fpme->vertex_size;
+   fetched_vert_info.verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size *
+                                     align(fetch_info->count,  4));
+   if (!fetched_vert_info.verts) {
+      assert(0);
+      return;
+   }
+
+   /* Fetch into our vertex buffer (linear or indexed, per fetch_info). */
+   fetch( fpme->fetch, fetch_info, (char *)fetched_vert_info.verts );
+
+   /* Finished with fetch: vert_info tracks the buffer currently in use. */
+   fetch_info = NULL;
+   vert_info = &fetched_vert_info;
+
+   /* Run the vertex shader.  Its output goes to a separately allocated
+    * buffer (vs_vert_info), so the fetched buffer is released afterwards.
+    */
+   if (fpme->opt & PT_SHADE) {
+      draw_vertex_shader_run(vshader,
+                             draw->pt.user.vs_constants,
+                             vert_info,
+                             &vs_vert_info);
+
+      FREE(vert_info->verts);
+      vert_info = &vs_vert_info;
+   }
+
+   if ((fpme->opt & PT_SHADE) && gshader) {
+      draw_geometry_shader_run(gshader,
+                               draw->pt.user.gs_constants,
+                               vert_info,
+                               prim_info,
+                               &gs_vert_info,
+                               &gs_prim_info);
+
+      FREE(vert_info->verts);
+      vert_info = &gs_vert_info;
+      prim_info = &gs_prim_info;
+   }
+
+   /* Stream output needs to be done before clipping.
+    *
+    * XXX: Stream output surely needs to respect the prim_info->elts
+    *      lists.
+    */
+   draw_pt_so_emit( fpme->so_emit,
+                    vert_info,
+                    prim_info );
+
+   if (draw_pt_post_vs_run( fpme->post_vs,
+                            vert_info ))
+   {
+      opt |= PT_PIPELINE;
+   }
+
+   /* Do we need to run the pipeline?  (post-VS may have forced it above.) */
+   if (opt & PT_PIPELINE) {
+      pipeline( fpme,
+                vert_info,
+                prim_info );
+   }
+   else {
+      emit( fpme->emit,
+            vert_info,
+            prim_info );
+   }
+   FREE(vert_info->verts);
+}
 
 static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
                                 const unsigned *fetch_elts,
@@ -135,94 +294,23 @@
                                 unsigned draw_count )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *vshader = draw->vs.vertex_shader;
-   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
-   unsigned opt = fpme->opt;
-   struct vertex_header *pipeline_verts;
-   unsigned alloc_count = draw_max_output_vertices(draw,
-                                                   fpme->input_prim,
-                                                   fetch_count);
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+   fetch_info.linear = FALSE;
+   fetch_info.start = 0;
+   fetch_info.elts = fetch_elts;
+   fetch_info.count = fetch_count;
 
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
-      assert(0);
-      return;
-   }
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
 
-   /* Fetch into our vertex buffer
-    */
-   draw_pt_fetch_run( fpme->fetch,
-		      fetch_elts, 
-		      fetch_count,
-		      (char *)pipeline_verts );
-
-   /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.
-    */
-   if (opt & PT_SHADE)
-   {
-      vshader->run_linear(vshader,
-                          (const float (*)[4])pipeline_verts->data,
-                          (      float (*)[4])pipeline_verts->data,
-                          draw->pt.user.vs_constants,
-                          fetch_count,
-                          fpme->vertex_size,
-                          fpme->vertex_size);
-      if (gshader) {
-         fetch_count =
-            draw_geometry_shader_run(gshader,
-                                     fpme->input_prim,
-                                     (const float (*)[4])pipeline_verts->data,
-                                     (      float (*)[4])pipeline_verts->data,
-                                     draw->pt.user.gs_constants,
-                                     fetch_count,
-                                     fpme->vertex_size,
-                                     fpme->vertex_size);
-         debug_assert(fetch_count <= alloc_count);
-      }
-   }
-
-   /* stream output needs to be done before clipping */
-   draw_pt_so_emit( fpme->so_emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    fetch_count,
-		    fpme->vertex_size );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    fetch_count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->output_prim,
-                         pipeline_verts,
-                         fetch_count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
-   }
-   else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    fetch_count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
-   }
-
-
-   FREE(pipeline_verts);
+   fetch_pipeline_generic( middle, &fetch_info, &prim_info );
 }
 
 
@@ -231,185 +319,52 @@
                                        unsigned count)
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
-   struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader;
-   unsigned opt = fpme->opt;
-   struct vertex_header *pipeline_verts;
-   unsigned alloc_count = draw_max_output_vertices(draw,
-                                                   fpme->input_prim,
-                                                   count);
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
 
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
-      assert(0);
-      return;
-   }
+   prim_info.linear = TRUE;
+   prim_info.start = 0;
+   prim_info.count = count;
+   prim_info.elts = NULL;
+   prim_info.prim = fpme->input_prim;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &count;
 
-   /* Fetch into our vertex buffer
-    */
-   draw_pt_fetch_run_linear( fpme->fetch,
-                             start,
-                             count,
-                             (char *)pipeline_verts );
-
-   /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.
-    */
-   if (opt & PT_SHADE)
-   {
-      shader->run_linear(shader,
-			 (const float (*)[4])pipeline_verts->data,
-			 (      float (*)[4])pipeline_verts->data,
-                         draw->pt.user.vs_constants,
-			 count,
-			 fpme->vertex_size,
-			 fpme->vertex_size);
-
-      if (geometry_shader) {
-         count =
-            draw_geometry_shader_run(geometry_shader,
-                                     fpme->input_prim,
-                                     (const float (*)[4])pipeline_verts->data,
-                                     (      float (*)[4])pipeline_verts->data,
-                                     draw->pt.user.gs_constants,
-                                     count,
-                                     fpme->vertex_size,
-                                     fpme->vertex_size);
-         debug_assert(count <= alloc_count);
-      }
-   }
-
-   /* stream output needs to be done before clipping */
-   draw_pt_so_emit( fpme->so_emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run_linear( fpme->draw,
-                                fpme->output_prim,
-                                pipeline_verts,
-                                count,
-                                fpme->vertex_size);
-   }
-   else {
-      draw_pt_emit_linear( fpme->emit,
-                           (const float (*)[4])pipeline_verts->data,
-                           fpme->vertex_size,
-                           count );
-   }
-
-   FREE(pipeline_verts);
+   fetch_pipeline_generic( middle, &fetch_info, &prim_info );
 }
 
 
 
 static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle,
-                                            unsigned start,
-                                            unsigned count,
-                                            const ushort *draw_elts,
-                                            unsigned draw_count )
+                                               unsigned start,
+                                               unsigned count,
+                                               const ushort *draw_elts,
+                                               unsigned draw_count )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
-   struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader;
-   unsigned opt = fpme->opt;
-   struct vertex_header *pipeline_verts;
-   unsigned alloc_count = draw_max_output_vertices(draw,
-                                                   fpme->input_prim,
-                                                   count);
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
 
-   if (!pipeline_verts)
-      return FALSE;
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
 
-   /* Fetch into our vertex buffer
-    */
-   draw_pt_fetch_run_linear( fpme->fetch,
-                             start,
-                             count,
-                             (char *)pipeline_verts );
+   fetch_pipeline_generic( middle, &fetch_info, &prim_info );
 
-   /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.
-    */
-   if (opt & PT_SHADE)
-   {
-      shader->run_linear(shader,
-			 (const float (*)[4])pipeline_verts->data,
-			 (      float (*)[4])pipeline_verts->data,
-                         draw->pt.user.vs_constants,
-			 count,
-			 fpme->vertex_size,
-			 fpme->vertex_size);
-
-      if (geometry_shader) {
-         count =
-            draw_geometry_shader_run(geometry_shader,
-                                     fpme->input_prim,
-                                     (const float (*)[4])pipeline_verts->data,
-                                     (      float (*)[4])pipeline_verts->data,
-                                     draw->pt.user.gs_constants,
-                                     count,
-                                     fpme->vertex_size,
-                                     fpme->vertex_size);
-         debug_assert(count <= alloc_count);
-      }
-   }
-
-   /* stream output needs to be done before clipping */
-   draw_pt_so_emit( fpme->so_emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->output_prim,
-                         pipeline_verts,
-                         count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
-   }
-   else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
-   }
-
-   FREE(pipeline_verts);
    return TRUE;
 }
 
@@ -464,7 +419,7 @@
       goto fail;
 
    fpme->emit = draw_pt_emit_create( draw );
-   if (!fpme->emit) 
+   if (!fpme->emit)
       goto fail;
 
    fpme->so_emit = draw_pt_so_emit_create( draw );
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 5f6d238..c7f7639 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -28,6 +28,7 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "draw/draw_context.h"
+#include "draw/draw_gs.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
@@ -48,7 +49,6 @@
    unsigned vertex_data_offset;
    unsigned vertex_size;
    unsigned input_prim;
-   unsigned output_prim;
    unsigned opt;
 
    struct draw_llvm *llvm;
@@ -61,7 +61,6 @@
 static void
 llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
                          unsigned in_prim,
-                         unsigned out_prim,
                          unsigned opt,
                          unsigned *max_vertices )
 {
@@ -73,6 +72,11 @@
    unsigned i;
    unsigned instance_id_index = ~0;
 
+
+   unsigned out_prim = (draw->gs.geometry_shader ? 
+                        draw->gs.geometry_shader->output_primitive :
+                        in_prim);
+
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
     */
@@ -89,7 +93,6 @@
    }
 
    fpme->input_prim = in_prim;
-   fpme->output_prim = out_prim;
    fpme->opt = opt;
 
    /* Always leave room for the vertex header whether we need it or
@@ -106,9 +109,10 @@
 			    (boolean)draw->bypass_clipping,
 			    (boolean)(draw->identity_viewport),
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
-			    (draw->vs.edgeflag_output ? true : false) );
+			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
 
-   draw_pt_so_emit_prepare( fpme->so_emit, out_prim );
+   draw_pt_so_emit_prepare( fpme->so_emit );
+
    if (!(opt & PT_PIPELINE)) {
       draw_pt_emit_prepare( fpme->emit,
 			    out_prim,
@@ -150,6 +154,117 @@
 }
 
 
+/* Run the draw pipeline on the shaded vertices: draw_pipeline_run_linear()
+ * when prim_info->linear is set, draw_pipeline_run() otherwise.
+ */
+static void pipeline(struct llvm_middle_end *llvm,
+                     const struct draw_vertex_info *vert_info,
+                     const struct draw_prim_info *prim_info)
+{
+   if (!prim_info->linear) {
+      draw_pipeline_run(llvm->draw, vert_info, prim_info);
+      return;
+   }
+   draw_pipeline_run_linear(llvm->draw, vert_info, prim_info);
+}
+
+/* Pass vertices to pt_emit: linear or indexed, per prim_info->linear. */
+static void emit(struct pt_emit *emit,
+                 const struct draw_vertex_info *vert_info,
+                 const struct draw_prim_info *prim_info)
+{
+   if (!prim_info->linear) {
+      draw_pt_emit(emit, vert_info, prim_info);
+      return;
+   }
+   draw_pt_emit_linear(emit, vert_info, prim_info);
+}
+
+static void   /* Shared fetch + shade + emit body for the middle-end run hooks. */
+llvm_pipeline_generic( struct draw_pt_middle_end *middle,
+                       const struct draw_fetch_info *fetch_info,
+                       const struct draw_prim_info *prim_info )
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
+   struct draw_prim_info gs_prim_info;
+   struct draw_vertex_info llvm_vert_info;
+   struct draw_vertex_info gs_vert_info;
+   struct draw_vertex_info *vert_info;
+   unsigned opt = fpme->opt;   /* local copy: PT_PIPELINE may be OR-ed in below */
+
+   llvm_vert_info.count = fetch_info->count;
+   llvm_vert_info.vertex_size = fpme->vertex_size;
+   llvm_vert_info.stride = fpme->vertex_size;
+   llvm_vert_info.verts =   /* sized for align(count, 4) vertices */
+      (struct vertex_header *)MALLOC(fpme->vertex_size *
+                                     align(fetch_info->count,  4));
+   if (!llvm_vert_info.verts) {   /* allocation failed: return without drawing */
+      assert(0);
+      return;
+   }
+
+   if (fetch_info->linear)   /* linear fetch: described by start + count */
+      fpme->current_variant->jit_func( &fpme->llvm->jit_context,
+                                       llvm_vert_info.verts,
+                                       (const char **)draw->pt.user.vbuffer,
+                                       fetch_info->start,
+                                       fetch_info->count,
+                                       fpme->vertex_size,
+                                       draw->pt.vertex_buffer );
+   else   /* indexed fetch: vertices named by fetch_info->elts */
+      fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
+                                            llvm_vert_info.verts,
+                                            (const char **)draw->pt.user.vbuffer,
+                                            fetch_info->elts,
+                                            fetch_info->count,
+                                            fpme->vertex_size,
+                                            draw->pt.vertex_buffer);
+
+   /* Finished with fetch and vs; fetch_info is cleared because nothing
+    * below reads it.  From here on vert_info names the current vertices. */
+   fetch_info = NULL;
+   vert_info = &llvm_vert_info;
+
+
+   if ((opt & PT_SHADE) && gshader) {   /* GS runs only when shading is enabled */
+      draw_geometry_shader_run(gshader,
+                               draw->pt.user.gs_constants,
+                               vert_info,
+                               prim_info,
+                               &gs_vert_info,
+                               &gs_prim_info);
+
+      FREE(vert_info->verts);   /* jit output buffer is replaced by the GS results */
+      vert_info = &gs_vert_info;
+      prim_info = &gs_prim_info;
+   }
+
+   /* stream output needs to be done before clipping */
+   draw_pt_so_emit( fpme->so_emit,
+		    vert_info,
+                    prim_info );
+
+   if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) {   /* TRUE forces the pipeline path */
+      opt |= PT_PIPELINE;
+   }
+
+   /* Do we need to run the pipeline?
+    */
+   if (opt & PT_PIPELINE) {
+      pipeline( fpme,
+                vert_info,
+                prim_info );
+   }
+   else {
+      emit( fpme->emit,
+            vert_info,
+            prim_info );
+   }
+   FREE(vert_info->verts);   /* the jit buffer, or the GS buffer when a GS ran */
+}
+
 
 static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
                                  const unsigned *fetch_elts,
@@ -158,64 +273,23 @@
                                  unsigned draw_count )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( fetch_count, 4 );
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+   fetch_info.linear = FALSE;
+   fetch_info.start = 0;
+   fetch_info.elts = fetch_elts;
+   fetch_info.count = fetch_count;
 
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
-      assert(0);
-      return;
-   }
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
 
-   fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
-                                         pipeline_verts,
-                                         (const char **)draw->pt.user.vbuffer,
-                                         fetch_elts,
-                                         fetch_count,
-                                         fpme->vertex_size,
-                                         draw->pt.vertex_buffer );
-
-   /* stream output needs to be done before clipping */
-   draw_pt_so_emit( fpme->so_emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    fetch_count,
-		    fpme->vertex_size );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    fetch_count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->output_prim,
-                         pipeline_verts,
-                         fetch_count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
-   }
-   else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    fetch_count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
-   }
-
-
-   FREE(pipeline_verts);
+   llvm_pipeline_generic( middle, &fetch_info, &prim_info );
 }
 
 
@@ -224,63 +298,23 @@
                                        unsigned count)
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( count, 4 );
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
 
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
-      assert(0);
-      return;
-   }
+   prim_info.linear = TRUE;
+   prim_info.start = 0;
+   prim_info.count = count;
+   prim_info.elts = NULL;
+   prim_info.prim = fpme->input_prim;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &count;
 
-#if 0
-   debug_printf("#### Pipeline = %p (data = %p)\n",
-                pipeline_verts, pipeline_verts->data);
-#endif
-   fpme->current_variant->jit_func( &fpme->llvm->jit_context,
-                                    pipeline_verts,
-                                    (const char **)draw->pt.user.vbuffer,
-                                    start,
-                                    count,
-                                    fpme->vertex_size,
-                                    draw->pt.vertex_buffer );
-
-   /* stream output needs to be done before clipping */
-   draw_pt_so_emit( fpme->so_emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run_linear( fpme->draw,
-                                fpme->output_prim,
-                                pipeline_verts,
-                                count,
-                                fpme->vertex_size);
-   }
-   else {
-      draw_pt_emit_linear( fpme->emit,
-                           (const float (*)[4])pipeline_verts->data,
-                           fpme->vertex_size,
-                           count );
-   }
-
-   FREE(pipeline_verts);
+   llvm_pipeline_generic( middle, &fetch_info, &prim_info );
 }
 
 
@@ -293,59 +327,24 @@
                                  unsigned draw_count )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( count, 4 );
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
 
-   if (!pipeline_verts)
-      return FALSE;
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
 
-   fpme->current_variant->jit_func( &fpme->llvm->jit_context,
-                                    pipeline_verts,
-                                    (const char **)draw->pt.user.vbuffer,
-                                    start,
-                                    count,
-                                    fpme->vertex_size,
-                                    draw->pt.vertex_buffer );
+   llvm_pipeline_generic( middle, &fetch_info, &prim_info );
 
-   /* stream output needs to be done before clipping */
-   draw_pt_so_emit( fpme->so_emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->output_prim,
-                         pipeline_verts,
-                         count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
-   }
-   else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
-   }
-
-   FREE(pipeline_verts);
    return TRUE;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index fd33a54..112be50 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -29,16 +29,13 @@
 #include "pipe/p_context.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
-#include "draw/draw_vbuf.h"
 #include "draw/draw_pt.h"
 
 struct pt_post_vs {
    struct draw_context *draw;
 
    boolean (*run)( struct pt_post_vs *pvs,
-		struct vertex_header *vertices,
-		unsigned count,
-		unsigned stride );
+                   struct draw_vertex_info *info );
 };
 
 
@@ -92,20 +89,18 @@
  * instructions
  */
 static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
-					  struct vertex_header *vertices,
-					  unsigned count,
-					  unsigned stride )
+                                             struct draw_vertex_info *info )
 {
-   struct vertex_header *out = vertices;
+   struct vertex_header *out = info->verts;
    const float *scale = pvs->draw->viewport.scale;
    const float *trans = pvs->draw->viewport.translate;
    const unsigned pos = draw_current_shader_position_output(pvs->draw);
    unsigned clipped = 0;
    unsigned j;
 
-   if (0) debug_printf("%s count, %d\n", __FUNCTION__, count);
+   if (0) debug_printf("%s count, %d\n", __FUNCTION__, info->count);
 
-   for (j = 0; j < count; j++) {
+   for (j = 0; j < info->count; j++) {
       float *position = out->data[pos];
 
 #if 0
@@ -143,7 +138,7 @@
 #endif
       }
 
-      out = (struct vertex_header *)( (char *)out + stride );
+      out = (struct vertex_header *)( (char *)out + info->stride );
    }
 
    return clipped != 0;
@@ -153,29 +148,27 @@
 
 /* As above plus edgeflags
  */
-static boolean 
+static boolean
 post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs,
-                                      struct vertex_header *vertices,
-                                      unsigned count,
-                                      unsigned stride )
+                                      struct draw_vertex_info *info)
 {
    unsigned j;
    boolean needpipe;
 
-   needpipe = post_vs_cliptest_viewport_gl( pvs, vertices, count, stride);
+   needpipe = post_vs_cliptest_viewport_gl(pvs, info);
 
    /* If present, copy edgeflag VS output into vertex header.
     * Otherwise, leave header as is.
     */
    if (pvs->draw->vs.edgeflag_output) {
-      struct vertex_header *out = vertices;
+      struct vertex_header *out = info->verts;
       int ef = pvs->draw->vs.edgeflag_output;
 
-      for (j = 0; j < count; j++) {
+      for (j = 0; j < info->count; j++) {
          const float *edgeflag = out->data[ef];
          out->edgeflag = !(edgeflag[0] != 1.0f);
          needpipe |= !out->edgeflag;
-         out = (struct vertex_header *)( (char *)out + stride );
+         out = (struct vertex_header *)( (char *)out + info->stride );
       }
    }
    return needpipe;
@@ -187,18 +180,16 @@
 /* If bypass_clipping is set, skip cliptest and rhw divide.
  */
 static boolean post_vs_viewport( struct pt_post_vs *pvs,
-			      struct vertex_header *vertices,
-			      unsigned count,
-			      unsigned stride )
+                                 struct draw_vertex_info *info )
 {
-   struct vertex_header *out = vertices;
+   struct vertex_header *out = info->verts;
    const float *scale = pvs->draw->viewport.scale;
    const float *trans = pvs->draw->viewport.translate;
    const unsigned pos = draw_current_shader_position_output(pvs->draw);
    unsigned j;
 
    if (0) debug_printf("%s\n", __FUNCTION__);
-   for (j = 0; j < count; j++) {
+   for (j = 0; j < info->count; j++) {
       float *position = out->data[pos];
 
       /* Viewport mapping only, no cliptest/rhw divide
@@ -207,9 +198,9 @@
       position[1] = position[1] * scale[1] + trans[1];
       position[2] = position[2] * scale[2] + trans[2];
 
-      out = (struct vertex_header *)((char *)out + stride);
+      out = (struct vertex_header *)((char *)out + info->stride);
    }
-   
+
    return FALSE;
 }
 
@@ -218,20 +209,16 @@
  * to do.
  */
 static boolean post_vs_none( struct pt_post_vs *pvs,
-			     struct vertex_header *vertices,
-			     unsigned count,
-			     unsigned stride )
+			     struct draw_vertex_info *info )
 {
    if (0) debug_printf("%s\n", __FUNCTION__);
    return FALSE;
 }
 
 boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
-			     struct vertex_header *pipeline_verts,
-			     unsigned count,
-			     unsigned stride )
+			     struct draw_vertex_info *info )
 {
-   return pvs->run( pvs, pipeline_verts, count, stride );
+   return pvs->run( pvs, info );
 }
 
 
@@ -272,7 +259,7 @@
       return NULL;
 
    pvs->draw = draw;
-   
+
    return pvs;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index bb153ce..5d82934 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -25,151 +25,264 @@
  *
  **************************************************************************/
 
-#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
-#include "translate/translate.h"
-#include "translate/translate_cache.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 struct pt_so_emit {
    struct draw_context *draw;
 
-   struct translate *translate;
+   void *buffers[PIPE_MAX_SO_BUFFERS];
 
-   struct translate_cache *cache;
-   unsigned prim;
+   unsigned input_vertex_stride;
+   const float (*inputs)[4];
 
-   const struct vertex_info *vinfo;
    boolean has_so;
+
+   boolean single_buffer;
+
+   unsigned emitted_primitives;
+   unsigned emitted_vertices;
 };
 
-static void
-prepare_so_emit( struct pt_so_emit *emit,
-                 const struct vertex_info *vinfo )
+
+void draw_pt_so_emit_prepare(struct pt_so_emit *emit)
 {
    struct draw_context *draw = emit->draw;
-   unsigned i;
-   struct translate_key hw_key;
-   unsigned dst_offset = 0;
-
-   if (emit->has_so) {
-      for (i = 0; i < draw->so.state.num_outputs; ++i) {
-         unsigned src_offset = (draw->so.state.register_index[i] * 4 *
-                                sizeof(float) );
-         unsigned output_format;
-         unsigned emit_sz = 0;
-         /*unsigned output_bytes = util_format_get_blocksize(output_format);
-           unsigned nr_compo = util_format_get_nr_components(output_format);*/
-
-         output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
-         emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
-
-         /* doesn't handle EMIT_OMIT */
-         assert(emit_sz != 0);
-
-         if (draw->so.state.register_mask[i] != TGSI_WRITEMASK_XYZW) {
-            /* we only support rendering with XYZW writemask*/
-            debug_printf("NOT_IMPLEMENTED(writemask with stream output) at %s: %s:%d\n",
-                         __FUNCTION__, __FILE__, __LINE__);
-         }
-
-         hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
-         hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-         hw_key.element[i].input_buffer = 0;
-         hw_key.element[i].input_offset = src_offset;
-         hw_key.element[i].instance_divisor = 0;
-         hw_key.element[i].output_format = output_format;
-         hw_key.element[i].output_offset = dst_offset;
-
-         dst_offset += emit_sz;
-      }
-      hw_key.nr_elements = draw->so.state.num_outputs;
-      hw_key.output_stride = draw->so.state.stride;
-
-      if (!emit->translate ||
-          translate_key_compare(&emit->translate->key, &hw_key) != 0)
-      {
-         translate_key_sanitize(&hw_key);
-         emit->translate = translate_cache_find(emit->cache, &hw_key);
-      }
-   } else {
-      /* no stream output */
-      emit->translate = NULL;
-   }
-}
-
-
-void draw_pt_so_emit_prepare( struct pt_so_emit *emit,
-                              unsigned prim )
-{
-   struct draw_context *draw = emit->draw;
-   boolean ok;
 
    emit->has_so = (draw->so.state.num_outputs > 0);
 
+   /* if we have a state with outputs make sure we have
+    * buffers to output to */
+   if (emit->has_so) {
+      boolean has_valid_buffer = FALSE;
+      unsigned i;
+      for (i = 0; i < draw->so.num_buffers; ++i) {
+         if (draw->so.buffers[i]) {
+            has_valid_buffer = TRUE;
+            break;
+         }
+      }
+      emit->has_so = has_valid_buffer;
+   }
+
    if (!emit->has_so)
       return;
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+}
 
-   emit->prim = prim;
-
-   ok = draw->render->set_primitive(draw->render, emit->prim);
-   if (!ok) {
-      assert(0);
-      return;
+static boolean   /* TRUE when component compo (0=X, 1=Y, 2=Z, 3=W) is enabled in mask */
+is_component_writable(unsigned mask,
+                      unsigned compo)
+{
+   switch (mask) {
+   case TGSI_WRITEMASK_NONE:
+      return FALSE;
+   case TGSI_WRITEMASK_X:
+      return compo == 0;
+   case TGSI_WRITEMASK_Y:
+      return compo == 1;
+   case TGSI_WRITEMASK_XY:
+      return compo == 0 || compo == 1;
+   case TGSI_WRITEMASK_Z:
+      return compo == 2;
+   case TGSI_WRITEMASK_XZ:
+      return compo == 0 || compo == 2;
+   case TGSI_WRITEMASK_YZ:
+      return compo == 1 || compo == 2;
+   case TGSI_WRITEMASK_XYZ:
+      return compo == 0 || compo == 1 || compo == 2;
+   case TGSI_WRITEMASK_W:
+      return compo == 3;
+   case TGSI_WRITEMASK_XW:
+      return compo == 0 || compo == 3;
+   case TGSI_WRITEMASK_YW:
+      return compo == 1 || compo == 3;
+   case TGSI_WRITEMASK_XYW:
+      return compo == 0 || compo == 1 || compo == 3;
+   case TGSI_WRITEMASK_ZW:
+      return compo == 2 || compo == 3;
+   case TGSI_WRITEMASK_XZW:
+      return compo == 0 || compo == 2 || compo == 3;   /* X, Z, W */
+   case TGSI_WRITEMASK_YZW:
+      return compo == 1 || compo == 2 || compo == 3;   /* Y, Z, W */
+   case TGSI_WRITEMASK_XYZW:
+      return compo < 4;
+   default:
+      debug_assert(!"Unknown writemask in stream out");
+      return compo < 4;   /* unrecognised mask: treat all four components as enabled */
    }
+}
 
-   /* Must do this after set_primitive() above: */
-   emit->vinfo = draw->render->get_vertex_info(draw->render);
+static void so_emit_prim(struct pt_so_emit *so,   /* write one primitive's vertices to the SO buffers */
+                         unsigned *indices,       /* vertex numbers into so->inputs */
+                         unsigned num_vertices)   /* number of entries in indices */
+{
+   unsigned slot, i;
+   unsigned input_vertex_stride = so->input_vertex_stride;
+   struct draw_context *draw = so->draw;
+   const float (*input_ptr)[4];
+   const struct pipe_stream_output_state *state =
+      &draw->so.state;
+   float **buffer = 0;   /* address of the write cursor for the slot being processed */
 
-   prepare_so_emit( emit, emit->vinfo );
+   input_ptr = so->inputs;
+
+   for (i = 0; i < num_vertices; ++i) {
+      const float (*input)[4];
+      unsigned total_written_compos = 0;
+      /*debug_printf("%d) vertex index = %d (prim idx = %d)\n", i, indices[i], prim_idx);*/
+      input = (const float (*)[4])(   /* vertex indices[i], found by byte offset */
+         (const char *)input_ptr + (indices[i] * input_vertex_stride));
+      for (slot = 0; slot < state->num_outputs; ++slot) {
+         unsigned idx = state->register_index[slot];
+         unsigned writemask = state->register_mask[slot];
+         unsigned written_compos = 0;
+         unsigned compo;
+
+         buffer = (float**)&so->buffers[state->output_buffer[slot]];   /* cursor lives in so->buffers[] */
+
+         /*debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n",
+           slot, vs_slot, idx);*/
+#if 1
+         assert(!util_is_inf_or_nan(input[idx][0]));
+         assert(!util_is_inf_or_nan(input[idx][1]));
+         assert(!util_is_inf_or_nan(input[idx][2]));
+         assert(!util_is_inf_or_nan(input[idx][3]));
+#endif
+         for (compo = 0; compo < 4; ++compo) {   /* pack only the components enabled in writemask */
+            if (is_component_writable(writemask, compo)) {
+               float *buf = *buffer;
+               buf[written_compos++] = input[idx][compo];
+            }
+         }
+#if 0
+         debug_printf("\t\t(writemask = %d)%f %f %f %f\n",
+                      writemask,
+                      input[idx][0],
+                      input[idx][1],
+                      input[idx][2],
+                      input[idx][3]);
+#endif
+         *buffer += written_compos;   /* advance the cursor past the floats just written */
+         total_written_compos += written_compos;
+      }
+      if (so->single_buffer) {   /* pad this vertex out to state->stride, used here as a byte count */
+         int stride = (int)state->stride -
+                      sizeof(float) * total_written_compos;
+
+         debug_assert(stride >= 0);
+         *buffer = (float*) (((char*)*buffer) + stride);   /* buffer still refers to the last slot's cursor */
+      }
+   }
+   so->emitted_vertices += num_vertices;
+   ++so->emitted_primitives;
+}
+
+/* Stream out a point: a single-vertex primitive. */
+static void so_point(struct pt_so_emit *so, int idx)
+{
+   unsigned vertex_ids[1];
+
+   vertex_ids[0] = (unsigned)idx;
+   so_emit_prim(so, vertex_ids, 1);
+}
+
+/* Stream out a line: a two-vertex primitive. */
+static void so_line(struct pt_so_emit *so, int i0, int i1)
+{
+   unsigned vertex_ids[2];
+
+   vertex_ids[0] = (unsigned)i0;
+   vertex_ids[1] = (unsigned)i1;
+   so_emit_prim(so, vertex_ids, 2);
+}
+
+/* Stream out a triangle: a three-vertex primitive. */
+static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2)
+{
+   unsigned vertex_ids[3];
+
+   vertex_ids[0] = (unsigned)i0;
+   vertex_ids[1] = (unsigned)i1;
+   vertex_ids[2] = (unsigned)i2;
+   so_emit_prim(so, vertex_ids, 3);
 }
 
 
+#define TRIANGLE(gs,i0,i1,i2) so_tri(gs,i0,i1,i2)
+#define LINE(gs,i0,i1)        so_line(gs,i0,i1)
+#define POINT(gs,i0)          so_point(gs,i0)
+#define FUNC so_run_linear   /* linear variant: positions are used as vertex numbers directly */
+#define LOCAL_VARS           /* no extra locals needed for the linear variant */
+#include "draw_so_emit_tmp.h"
+#undef LOCAL_VARS
+#undef FUNC
+
+
+#define TRIANGLE(gs,i0,i1,i2) so_tri(gs,elts[i0],elts[i1],elts[i2])
+#define LINE(gs,i0,i1)        so_line(gs,elts[i0],elts[i1])
+#define POINT(gs,i0)          so_point(gs,elts[i0])
+#define FUNC so_run_elts   /* indexed variant: positions are translated through elts[] */
+#define LOCAL_VARS                         \
+   const ushort *elts = input_prims->elts;
+#include "draw_so_emit_tmp.h"   /* template that defines the function named FUNC */
+#undef LOCAL_VARS
+#undef FUNC
+
+
 void draw_pt_so_emit( struct pt_so_emit *emit,
-		   const float (*vertex_data)[4],
-		   unsigned vertex_count,
-		   unsigned stride )
+                      const struct draw_vertex_info *input_verts,
+                      const struct draw_prim_info *input_prims )
 {
    struct draw_context *draw = emit->draw;
-   struct translate *translate = emit->translate;
    struct vbuf_render *render = draw->render;
-   void *so_buffer;
+   unsigned start, i;
 
    if (!emit->has_so)
       return;
 
-   so_buffer = draw->so.buffers[0];
+   emit->emitted_vertices = 0;
+   emit->emitted_primitives = 0;
+   emit->input_vertex_stride = input_verts->stride;
+   emit->inputs = (const float (*)[4])input_verts->verts->data;
+   for (i = 0; i < draw->so.num_buffers; ++i) {
+      emit->buffers[i] = draw->so.buffers[i];
+   }
+   emit->single_buffer = TRUE;
+   for (i = 0; i < draw->so.state.num_outputs; ++i) {
+      if (draw->so.state.output_buffer[i] != 0)
+         emit->single_buffer = FALSE;
+   }
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??*/
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (vertex_count == 0)
-      return;
+   for (start = i = 0; i < input_prims->primitive_count;
+        start += input_prims->primitive_lengths[i], i++)
+   {
+      unsigned count = input_prims->primitive_lengths[i];
 
-   if (vertex_count >= UNDEFINED_VERTEX_ID) {
-      assert(0);
-      return;
+      if (input_prims->linear) {
+         so_run_linear(emit, input_prims, input_verts,
+                       start, count);
+      } else {
+         so_run_elts(emit, input_prims, input_verts,
+                     start, count);
+      }
    }
 
-   /* XXX we only support single output buffer */
-   if (draw->so.num_buffers != 1) {
-      debug_printf("NOT_IMPLEMENTED(multiple stream output buffers) at %s: %s:%d\n",
-                   __FUNCTION__, __FILE__, __LINE__);
-   }
-
-   translate->set_buffer(translate, 0, vertex_data,
-                         stride, ~0);
-   translate->run(translate, 0, vertex_count,
-                  draw->instance_id, so_buffer);
-
-   render->set_stream_output_info(render, 0, vertex_count);
+   render->set_stream_output_info(render,
+                                  emit->emitted_primitives,
+                                  emit->emitted_vertices);
 }
 
 
@@ -180,19 +293,11 @@
       return NULL;
 
    emit->draw = draw;
-   emit->cache = translate_cache_create();
-   if (!emit->cache) {
-      FREE(emit);
-      return NULL;
-   }
 
    return emit;
 }
 
 void draw_pt_so_emit_destroy( struct pt_so_emit *emit )
 {
-   if (emit->cache)
-      translate_cache_destroy(emit->cache);
-
    FREE(emit);
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
index 5ea8330..d89d5cd 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ b/src/gallium/auxiliary/draw/draw_pt_varray.c
@@ -137,7 +137,6 @@
 
 static void varray_prepare(struct draw_pt_front_end *frontend,
                            unsigned in_prim,
-                           unsigned out_prim,
                            struct draw_pt_middle_end *middle,
                            unsigned opt)
 {
@@ -146,11 +145,12 @@
    varray->base.run = varray_run;
 
    varray->input_prim = in_prim;
-   varray->output_prim = decompose_prim[out_prim];
+   varray->output_prim = decompose_prim[in_prim];
 
    varray->middle = middle;
-   middle->prepare(middle, varray->input_prim,
-                   varray->output_prim, opt, &varray->driver_fetch_max );
+   middle->prepare(middle,
+                   varray->output_prim,
+                   opt, &varray->driver_fetch_max );
 
    /* check that the max is even */
    assert((varray->driver_fetch_max & 1) == 0);
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index 914c87a..b7e0da7d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -70,7 +70,6 @@
    if (vcache->middle_prim != vcache->output_prim) {
       vcache->middle_prim = vcache->output_prim;
       vcache->middle->prepare( vcache->middle,
-                               vcache->input_prim,
                                vcache->middle_prim,
                                vcache->opt,
                                &vcache->fetch_max );
@@ -368,7 +367,6 @@
    if (vcache->middle_prim != vcache->input_prim) {
       vcache->middle_prim = vcache->input_prim;
       vcache->middle->prepare( vcache->middle,
-                               vcache->input_prim,
                                vcache->middle_prim,
                                vcache->opt,
                                &vcache->fetch_max );
@@ -472,7 +470,6 @@
 static void
 vcache_prepare( struct draw_pt_front_end *frontend,
                 unsigned in_prim,
-                unsigned out_prim,
                 struct draw_pt_middle_end *middle,
                 unsigned opt )
 {
@@ -487,8 +484,14 @@
       vcache->base.run = vcache_check_run;
    }
 
+   /* VCache will always emit the reduced version of its input
+    * primitive, ie STRIP/FANS become TRIS, etc.
+    *
+    * This is not to be confused with what the GS might be up to,
+    * which is a separate issue.
+    */
    vcache->input_prim = in_prim;
-   vcache->output_prim = u_reduced_prim(out_prim);
+   vcache->output_prim = u_reduced_prim(in_prim);
 
    vcache->middle = middle;
    vcache->opt = opt;
@@ -497,8 +500,9 @@
     * doing so:
     */
    vcache->middle_prim = (opt & PT_PIPELINE) ? vcache->output_prim : vcache->input_prim;
-   middle->prepare( middle, vcache->input_prim,
-                    vcache->middle_prim, opt, &vcache->fetch_max );
+   middle->prepare( middle,
+                    vcache->middle_prim,
+                    opt, &vcache->fetch_max );
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
new file mode 100644
index 0000000..01212a8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
@@ -0,0 +1,123 @@
+
+static void FUNC( struct pt_so_emit *so,
+                  const struct draw_prim_info *input_prims,
+                  const struct draw_vertex_info *input_verts,
+                  unsigned start,
+                  unsigned count)
+{
+   struct draw_context *draw = so->draw;
+
+   boolean flatfirst = (draw->rasterizer->flatshade &&
+                        draw->rasterizer->flatshade_first);
+   unsigned i;
+   LOCAL_VARS
+
+   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
+
+   debug_assert(input_prims->primitive_count == 1);
+
+   switch (input_prims->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < count; i++) {
+	 POINT( so, start + i + 0 );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 0; i+1 < count; i += 2) {
+         LINE( so , start + i + 0 , start + i + 1 );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      if (count >= 2) {
+
+         for (i = 1; i < count; i++) {
+            LINE( so, start + i - 1, start + i );
+         }
+
+	 LINE( so, start + i - 1, start );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < count; i++) {
+         LINE( so, start + i - 1, start + i );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      for (i = 0; i+2 < count; i += 3) {
+         TRIANGLE( so, start + i + 0, start + i + 1, start + i + 2 );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (flatfirst) {
+         for (i = 0; i+2 < count; i++) {
+            TRIANGLE( so,
+                      start + i + 0,
+                      start + i + 1 + (i&1),
+                      start + i + 2 - (i&1) );
+         }
+      }
+      else {
+         for (i = 0; i+2 < count; i++) {
+            TRIANGLE( so,
+                      start + i + 0 + (i&1),
+                      start + i + 1 - (i&1),
+                      start + i + 2 );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (count >= 3) {
+         if (flatfirst) {
+            for (i = 0; i+2 < count; i++) {
+               TRIANGLE( so,
+                         start + i + 1,
+                         start + i + 2,
+                         start );
+            }
+         }
+         else {
+            for (i = 0; i+2 < count; i++) {
+               TRIANGLE( so,
+                         start,
+                         start + i + 1,
+                         start + i + 2 );
+            }
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      {
+         /* Unless flatfirst is set, each triangle's vertices are
+          * submitted as (1,2,0) to satisfy flatshade requirements.
+          */
+
+	 for (i = 0; i+2 < count; i++) {
+
+            if (flatfirst) {
+               TRIANGLE( so, start + 0, start + i + 1, start + i + 2 );
+            }
+            else {
+               TRIANGLE( so, start + i + 1, start + i + 2, start + 0 );
+            }
+	 }
+      }
+      break;
+
+   default:
+      debug_assert(!"Unsupported primitive in stream output");
+      break;
+   }
+}
+
+
+#undef TRIANGLE
+#undef POINT
+#undef LINE
+#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
index 8d97682..e32803c 100644
--- a/src/gallium/auxiliary/draw/draw_vbuf.h
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -123,7 +123,7 @@
     * Called after writing data to the stream out buffers
     */
    void (*set_stream_output_info)( struct vbuf_render *vbufr,
-                                   unsigned buffer_index,
+                                   unsigned primitive_count,
                                    unsigned vertices_count );
 };
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index eb49204..87e3e72 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -310,21 +310,6 @@
 }
 
 
-typedef void (*fetch_func)(float *, const uint8_t *, unsigned, unsigned);
-
-/** cast wrapper */
-static void *
-fetch_func_ptr_to_voidptr(fetch_func f)
-{
-   union {
-      void *v;
-      fetch_func f;
-   } u;
-   u.f = f;
-   return u.v;
-}
-
-
 /**
  * Fetch a pixel into a 4 float AoS.
  *
@@ -406,7 +391,7 @@
          assert(LLVMIsDeclaration(function));
 
          LLVMAddGlobalMapping(lp_build_engine, function,
-                    fetch_func_ptr_to_voidptr(format_desc->fetch_rgba_float));
+                              func_to_pointer((func_pointer)format_desc->fetch_rgba_float));
       }
 
       tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 0a690ea..44cfdc4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -78,6 +78,9 @@
 extern void
 lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
 
+extern void
+lp_set_target_options(void);
+
 
 void
 lp_build_init(void)
@@ -86,6 +89,8 @@
    gallivm_debug = debug_get_flags_option("GALLIVM_DEBUG", lp_bld_debug_flags, 0 );
 #endif
 
+   lp_set_target_options();
+
    LLVMInitializeNativeTarget();
 
    LLVMLinkInJIT();
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index f004c0a..5a9488b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -36,6 +36,7 @@
 
 #include <llvm-c/Core.h>
 #include <llvm-c/ExecutionEngine.h>
+#include <llvm/Target/TargetOptions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/ExecutionEngine/JITEventListener.h>
 
@@ -119,3 +120,25 @@
 {
    llvm::unwrap(EE)->RegisterJITEventListener(llvm::createOProfileJITEventListener());
 }
+
+
+extern "C" void
+lp_set_target_options(void)
+{
+#if defined(DEBUG)
+#if HAVE_LLVM >= 0x0207
+   llvm::JITEmitDebugInfo = true;
+#endif
+#endif
+
+#if defined(DEBUG) || defined(PROFILE)
+   llvm::NoFramePointerElim = true;
+#endif
+
+   llvm::NoExcessFPPrecision = false;
+
+   /* XXX: Investigate this */
+#if 0
+   llvm::UnsafeFPMath = true;
+#endif
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 0890078..6dbedf1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -164,6 +164,7 @@
    full_declaration.Declaration  = tgsi_default_declaration();
    full_declaration.Range = tgsi_default_declaration_range();
    full_declaration.Semantic = tgsi_default_declaration_semantic();
+   full_declaration.ImmediateData.u = NULL;
 
    return full_declaration;
 }
@@ -180,7 +181,7 @@
    struct tgsi_declaration_range *dr;
 
    if( maxsize <= size )
-     return 0;
+      return 0;
    declaration = (struct tgsi_declaration *) &tokens[size];
    size++;
 
@@ -235,6 +236,24 @@
          header );
    }
 
+   if (full_decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
+      unsigned i, j;
+      union tgsi_immediate_data *data;
+
+      for (i = 0; i <= dr->Last; ++i) {
+         for (j = 0; j < 4; ++j) {
+            unsigned idx = i*4 + j;
+            if (maxsize <= size)
+               return 0;
+            data = (union tgsi_immediate_data *) &tokens[size];
+            ++size;
+
+            *data = full_decl->ImmediateData.u[idx];
+            declaration_grow( declaration, header );
+         }
+      }
+   }
+
    return size;
 }
 
@@ -613,6 +632,7 @@
          reg->Register.File,
          reg->Register.WriteMask,
          reg->Register.Indirect,
+         reg->Register.Dimension,
          reg->Register.Index,
          instruction,
          header );
@@ -640,6 +660,46 @@
             instruction,
             header );
       }
+
+      if( reg->Register.Dimension ) {
+         struct  tgsi_dimension *dim;
+
+         assert( !reg->Dimension.Dimension );
+
+         if( maxsize <= size )
+            return 0;
+         dim = (struct tgsi_dimension *) &tokens[size];
+         size++;
+
+         *dim = tgsi_build_dimension(
+            reg->Dimension.Indirect,
+            reg->Dimension.Index,
+            instruction,
+            header );
+
+         if( reg->Dimension.Indirect ) {
+            struct tgsi_src_register *ind;
+
+            if( maxsize <= size )
+               return 0;
+            ind = (struct tgsi_src_register *) &tokens[size];
+            size++;
+
+            *ind = tgsi_build_src_register(
+               reg->DimIndirect.File,
+               reg->DimIndirect.SwizzleX,
+               reg->DimIndirect.SwizzleY,
+               reg->DimIndirect.SwizzleZ,
+               reg->DimIndirect.SwizzleW,
+               reg->DimIndirect.Negate,
+               reg->DimIndirect.Absolute,
+               reg->DimIndirect.Indirect,
+               reg->DimIndirect.Dimension,
+               reg->DimIndirect.Index,
+               instruction,
+               header );
+         }
+      }
    }
 
    for( i = 0;  i < full_inst->Instruction.NumSrcRegs; i++ ) {
@@ -959,6 +1019,7 @@
    unsigned file,
    unsigned mask,
    unsigned indirect,
+   unsigned dimension,
    int index,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
@@ -974,6 +1035,7 @@
    dst_register.WriteMask = mask;
    dst_register.Index = index;
    dst_register.Indirect = indirect;
+   dst_register.Dimension = dimension;
 
    instruction_grow( instruction, header );
 
@@ -987,6 +1049,8 @@
 
    full_dst_register.Register = tgsi_default_dst_register();
    full_dst_register.Indirect = tgsi_default_src_register();
+   full_dst_register.Dimension = tgsi_default_dimension();
+   full_dst_register.DimIndirect = tgsi_default_src_register();
 
    return full_dst_register;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index 13d7f52..112107a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -263,6 +263,7 @@
    unsigned file,
    unsigned mask,
    unsigned indirect,
+   unsigned dimension,
    int index,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 3548007..9fcc28f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -101,7 +101,9 @@
    "ADDR",
    "IMM",
    "PRED",
-   "SV"
+   "SV",
+   "IMMX",
+   "TEMPX"
 };
 
 static const char *interpolate_names[] =
@@ -191,29 +193,30 @@
 
 
 static void
-_dump_register_dst(
-   struct dump_ctx *ctx,
-   uint file,
-   int index)
-{
-   ENM( file, file_names );
-
-   CHR( '[' );
-   SID( index );
-   CHR( ']' );
-}
-
-
-static void
 _dump_register_src(
    struct dump_ctx *ctx,
    const struct tgsi_full_src_register *src )
 {
    ENM(src->Register.File, file_names);
    if (src->Register.Dimension) {
-      CHR('[');
-      SID(src->Dimension.Index);
-      CHR(']');
+      if (src->Dimension.Indirect) {
+         CHR( '[' );
+         ENM( src->DimIndirect.File, file_names );
+         CHR( '[' );
+         SID( src->DimIndirect.Index );
+         TXT( "]." );
+         ENM( src->DimIndirect.SwizzleX, swizzle_names );
+         if (src->Dimension.Index != 0) {
+            if (src->Dimension.Index > 0)
+               CHR( '+' );
+            SID( src->Dimension.Index );
+         }
+         CHR( ']' );
+      } else {
+         CHR('[');
+         SID(src->Dimension.Index);
+         CHR(']');
+      }
    }
    if (src->Register.Indirect) {
       CHR( '[' );
@@ -235,31 +238,53 @@
    }
 }
 
-static void
-_dump_register_ind(
-   struct dump_ctx *ctx,
-   uint file,
-   int index,
-   uint ind_file,
-   int ind_index,
-   uint ind_swizzle )
-{
-   ENM( file, file_names );
-   CHR( '[' );
-   ENM( ind_file, file_names );
-   CHR( '[' );
-   SID( ind_index );
-   TXT( "]." );
-   ENM( ind_swizzle, swizzle_names );
-   if (index != 0) {
-      if (index > 0)
-         CHR( '+' );
-      SID( index );
-   }
-   CHR( ']' );
-}
 
 static void
+_dump_register_dst(
+   struct dump_ctx *ctx,
+   const struct tgsi_full_dst_register *dst )
+{
+   ENM(dst->Register.File, file_names);
+   if (dst->Register.Dimension) {
+      if (dst->Dimension.Indirect) {
+         CHR( '[' );
+         ENM( dst->DimIndirect.File, file_names );
+         CHR( '[' );
+         SID( dst->DimIndirect.Index );
+         TXT( "]." );
+         ENM( dst->DimIndirect.SwizzleX, swizzle_names );
+         if (dst->Dimension.Index != 0) {
+            if (dst->Dimension.Index > 0)
+               CHR( '+' );
+            SID( dst->Dimension.Index );
+         }
+         CHR( ']' );
+      } else {
+         CHR('[');
+         SID(dst->Dimension.Index);
+         CHR(']');
+      }
+   }
+   if (dst->Register.Indirect) {
+      CHR( '[' );
+      ENM( dst->Indirect.File, file_names );
+      CHR( '[' );
+      SID( dst->Indirect.Index );
+      TXT( "]." );
+      ENM( dst->Indirect.SwizzleX, swizzle_names );
+      if (dst->Register.Index != 0) {
+         if (dst->Register.Index > 0)
+            CHR( '+' );
+         SID( dst->Register.Index );
+      }
+      CHR( ']' );
+   } else {
+      CHR( '[' );
+      SID( dst->Register.Index );
+      CHR( ']' );
+   }
+}
+static void
 _dump_writemask(
    struct dump_ctx *ctx,
    uint writemask )
@@ -277,6 +302,39 @@
    }
 }
 
+static void
+dump_imm_data(struct tgsi_iterate_context *iter,
+              union tgsi_immediate_data *data,
+              unsigned num_tokens,
+              unsigned data_type)
+{
+   struct dump_ctx *ctx = (struct dump_ctx *)iter;
+   unsigned i ;
+
+   TXT( " {" );
+
+   assert( num_tokens <= 4 );
+   for (i = 0; i < num_tokens; i++) {
+      switch (data_type) {
+      case TGSI_IMM_FLOAT32:
+         FLT( data[i].Float );
+         break;
+      case TGSI_IMM_UINT32:
+         UID(data[i].Uint);
+         break;
+      case TGSI_IMM_INT32:
+         SID(data[i].Int);
+         break;
+      default:
+         assert( 0 );
+      }
+
+      if (i < num_tokens - 1)
+         TXT( ", " );
+   }
+   TXT( "}" );
+}
+
 static boolean
 iter_declaration(
    struct tgsi_iterate_context *iter,
@@ -357,6 +415,43 @@
       }
    }
 
+   if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
+      unsigned i;
+      char range_indent[4];
+
+      TXT(" {");
+
+      if (decl->Range.Last < 10)
+         range_indent[0] = '\0';
+      else if (decl->Range.Last < 100) {
+         range_indent[0] = ' ';
+         range_indent[1] = '\0';
+      } else if (decl->Range.Last < 1000) {
+         range_indent[0] = ' ';
+         range_indent[1] = ' ';
+         range_indent[2] = '\0';
+      } else {
+         range_indent[0] = ' ';
+         range_indent[1] = ' ';
+         range_indent[2] = ' ';
+         range_indent[3] = '\0';
+      }
+
+      dump_imm_data(iter, decl->ImmediateData.u,
+                    4, TGSI_IMM_FLOAT32);
+      for(i = 1; i <= decl->Range.Last; ++i) {
+         /* Indent by strlen of "DCL IMMX[0..1] {".  Each register holds
+          * four values, so register i starts at ImmediateData.u[i * 4]. */
+         CHR('\n');
+         TXT( "                " );
+         TXT( range_indent );
+         dump_imm_data(iter, decl->ImmediateData.u + i * 4,
+                       4, TGSI_IMM_FLOAT32);
+      }
+
+      TXT(" }");
+   }
+
    EOL();
 
    return TRUE;
@@ -430,33 +525,11 @@
 {
    struct dump_ctx *ctx = (struct dump_ctx *) iter;
 
-   uint i;
-
    TXT( "IMM " );
    ENM( imm->Immediate.DataType, immediate_type_names );
 
-   TXT( " { " );
-
-   assert( imm->Immediate.NrTokens <= 4 + 1 );
-   for (i = 0; i < imm->Immediate.NrTokens - 1; i++) {
-      switch (imm->Immediate.DataType) {
-      case TGSI_IMM_FLOAT32:
-         FLT( imm->u[i].Float );
-         break;
-      case TGSI_IMM_UINT32:
-         UID(imm->u[i].Uint);
-         break;
-      case TGSI_IMM_INT32:
-         SID(imm->u[i].Int);
-         break;
-      default:
-         assert( 0 );
-      }
-
-      if (i < imm->Immediate.NrTokens - 2)
-         TXT( ", " );
-   }
-   TXT( " }" );
+   dump_imm_data(iter, imm->u, imm->Immediate.NrTokens - 1,
+                 imm->Immediate.DataType);
 
    EOL();
 
@@ -487,12 +560,12 @@
 
    INSTID( instno );
    TXT( ": " );
-   
+
    ctx->indent -= info->pre_dedent;
    for(i = 0; (int)i < ctx->indent; ++i)
       TXT( "  " );
    ctx->indent += info->post_indent;
-   
+
    if (inst->Instruction.Predicate) {
       CHR( '(' );
 
@@ -539,21 +612,7 @@
          CHR( ',' );
       CHR( ' ' );
 
-      if (dst->Register.Indirect) {
-         _dump_register_ind(
-            ctx,
-            dst->Register.File,
-            dst->Register.Index,
-            dst->Indirect.File,
-            dst->Indirect.Index,
-            dst->Indirect.SwizzleX );
-      }
-      else {
-         _dump_register_dst(
-            ctx,
-            dst->Register.File,
-            dst->Register.Index );
-      }
+      _dump_register_dst( ctx, dst );
       _dump_writemask( ctx, dst->Register.WriteMask );
 
       first_reg = FALSE;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index c15d970..5275faa 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -699,6 +699,19 @@
                ++mach->NumOutputs;
             }
          }
+         if (parse.FullToken.FullDeclaration.Declaration.File ==
+             TGSI_FILE_IMMEDIATE_ARRAY) {
+            unsigned reg;
+            struct tgsi_full_declaration *decl =
+               &parse.FullToken.FullDeclaration;
+            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
+            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
+               for( i = 0; i < 4; i++ ) {
+                  int idx = reg * 4 + i;
+                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
+               }
+            }
+         }
          memcpy(declarations + numDeclarations,
                 &parse.FullToken.FullDeclaration,
                 sizeof(declarations[0]));
@@ -1046,8 +1059,15 @@
    case TGSI_FILE_INPUT:
    case TGSI_FILE_SYSTEM_VALUE:
       for (i = 0; i < QUAD_SIZE; i++) {
-         /* XXX: 2D indexing */
-         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
+         /*
+         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
+            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
+                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
+                         index2D->i[i], index->i[i]);
+                         }*/
+         chan->u[i] = mach->Inputs[index2D->i[i] *
+                                   TGSI_EXEC_MAX_INPUT_ATTRIBS +
+                                   index->i[i]].xyzw[swizzle].u[i];
       }
       break;
 
@@ -1060,6 +1080,16 @@
       }
       break;
 
+   case TGSI_FILE_TEMPORARY_ARRAY:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
+         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
+
+         chan->u[i] =
+            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
+      }
+      break;
+
    case TGSI_FILE_IMMEDIATE:
       for (i = 0; i < QUAD_SIZE; i++) {
          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
@@ -1069,6 +1099,14 @@
       }
       break;
 
+   case TGSI_FILE_IMMEDIATE_ARRAY:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index2D->i[i] == 0);
+
+         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
+      }
+      break;
+
    case TGSI_FILE_ADDRESS:
       for (i = 0; i < QUAD_SIZE; i++) {
          assert(index->i[i] >= 0);
@@ -1280,6 +1318,7 @@
    uint i;
    union tgsi_exec_channel null;
    union tgsi_exec_channel *dst;
+   union tgsi_exec_channel index2D;
    uint execmask = mach->ExecMask;
    int offset = 0;  /* indirection offset */
    int index;
@@ -1325,6 +1364,77 @@
       offset = indir_index.i[0];
    }
 
+   /* There is an extra source register that is a second
+    * subscript to a register file. Effectively it means that
+    * the register file is actually a 2D array of registers.
+    *
+    *    file[3][1],
+    *    where:
+    *       [3] = Dimension.Index
+    */
+   if (reg->Register.Dimension) {
+      index2D.i[0] =
+      index2D.i[1] =
+      index2D.i[2] =
+      index2D.i[3] = reg->Dimension.Index;
+
+      /* Again, the second subscript index can be addressed indirectly
+       * identically to the first one.
+       * Nothing stops us from indirectly addressing the indirect register,
+       * but there is no need for that, so we won't exercise it.
+       *
+       *    file[ind[4].y+3][1],
+       *    where:
+       *       ind = DimIndirect.File
+       *       [4] = DimIndirect.Index
+       *       .y = DimIndirect.SwizzleX
+       */
+      if (reg->Dimension.Indirect) {
+         union tgsi_exec_channel index2;
+         union tgsi_exec_channel indir_index;
+         const uint execmask = mach->ExecMask;
+         unsigned swizzle;
+         uint i;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->DimIndirect.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
+         fetch_src_file_channel(mach,
+                                reg->DimIndirect.File,
+                                swizzle,
+                                &index2,
+                                &ZeroVec,
+                                &indir_index);
+
+         index2D.i[0] += indir_index.i[0];
+         index2D.i[1] += indir_index.i[1];
+         index2D.i[2] += indir_index.i[2];
+         index2D.i[3] += indir_index.i[3];
+
+         /* for disabled execution channels, zero-out the index to
+          * avoid using a potential garbage value.
+          */
+         for (i = 0; i < QUAD_SIZE; i++) {
+            if ((execmask & (1 << i)) == 0) {
+               index2D.i[i] = 0;
+            }
+         }
+      }
+
+      /* If by any chance there was a need for a 3D array of register
+       * files, we would have to check whether Dimension is followed
+       * by a dimension register and continue the saga.
+       */
+   } else {
+      index2D.i[0] =
+      index2D.i[1] =
+      index2D.i[2] =
+      index2D.i[3] = 0;
+   }
+
    switch (reg->Register.File) {
    case TGSI_FILE_NULL:
       dst = &null;
@@ -1351,6 +1461,16 @@
       dst = &mach->Temps[offset + index].xyzw[chan_index];
       break;
 
+   case TGSI_FILE_TEMPORARY_ARRAY:
+      index = reg->Register.Index;
+      assert( index < TGSI_EXEC_NUM_TEMPS );
+      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
+      /* XXX we use index2D.i[0] here but somehow we might
+       * end up with someone trying to store indirectly in
+       * different buffers */
+      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
+      break;
+
    case TGSI_FILE_ADDRESS:
       index = reg->Register.Index;
       dst = &mach->Addrs[index].xyzw[chan_index];
@@ -1536,6 +1656,19 @@
    }
 }
 
+static void
+conditional_emit_primitive(struct tgsi_exec_machine *mach)
+{
+   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
+      int emitted_verts =
+         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
+      if (emitted_verts) {
+         emit_primitive(mach);
+      }
+   }
+}
+
+
 /*
  * Fetch four texture samples using STR texture coordinates.
  */
@@ -3185,6 +3318,9 @@
       break;
 
    case TGSI_OPCODE_END:
+      /* make sure we end primitives which haven't
+       * been explicitly emitted */
+      conditional_emit_primitive(mach);
       /* halt execution */
       *pc = -1;
       break;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 3caf820..ccf80ca 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -93,6 +93,7 @@
 
 #define TGSI_EXEC_NUM_TEMPS       128
 #define TGSI_EXEC_NUM_IMMEDIATES  256
+#define TGSI_EXEC_NUM_TEMP_ARRAYS 8
 
 /*
  * Locations of various utility registers (_I = Index, _C = Channel)
@@ -237,9 +238,12 @@
     */
    struct tgsi_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS +
                                        TGSI_EXEC_NUM_TEMP_EXTRAS];
+   struct tgsi_exec_vector       TempArray[TGSI_EXEC_NUM_TEMP_ARRAYS][TGSI_EXEC_NUM_TEMPS];
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
 
+   float                         ImmArray[TGSI_EXEC_NUM_IMMEDIATES][4];
+
    struct tgsi_exec_vector       Inputs[TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS];
    struct tgsi_exec_vector       Outputs[TGSI_MAX_TOTAL_VERTICES];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 7e19e1f..db9a342 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -117,6 +117,17 @@
          next_token( ctx, &decl->Semantic );
       }
 
+      if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
+         unsigned i, j;
+         decl->ImmediateData.u = (union tgsi_immediate_data*)
+                                 &ctx->Tokens[ctx->Position];
+         for (i = 0; i <= decl->Range.Last; ++i) {
+            for (j = 0; j < 4; ++j) {
+               ctx->Position++;
+            }
+         }
+      }
+
       break;
    }
 
@@ -181,11 +192,6 @@
 
          next_token( ctx, &inst->Dst[i].Register );
 
-         /*
-          * No support for indirect or multi-dimensional addressing.
-          */
-         assert( !inst->Dst[i].Register.Dimension );
-
          if( inst->Dst[i].Register.Indirect ) {
             next_token( ctx, &inst->Dst[i].Indirect );
 
@@ -195,6 +201,24 @@
             assert( !inst->Dst[i].Indirect.Dimension );
             assert( !inst->Dst[i].Indirect.Indirect );
          }
+         if( inst->Dst[i].Register.Dimension ) {
+            next_token( ctx, &inst->Dst[i].Dimension );
+
+            /*
+             * No support for multi-dimensional addressing.
+             */
+            assert( !inst->Dst[i].Dimension.Dimension );
+
+            if( inst->Dst[i].Dimension.Indirect ) {
+               next_token( ctx, &inst->Dst[i].DimIndirect );
+
+               /*
+                * The DimIndirect token just read must itself be a plain register.
+                */
+               assert( !inst->Dst[i].DimIndirect.Indirect );
+               assert( !inst->Dst[i].DimIndirect.Dimension );
+            }
+         }
       }
 
       assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index b45ccee..36de880 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -44,6 +44,8 @@
 {
    struct tgsi_dst_register               Register;
    struct tgsi_src_register               Indirect;
+   struct tgsi_dimension                  Dimension;
+   struct tgsi_src_register               DimIndirect;
 };
 
 struct tgsi_full_src_register
@@ -54,12 +56,18 @@
    struct tgsi_src_register         DimIndirect;
 };
 
+struct tgsi_immediate_array_data
+{
+   union tgsi_immediate_data *u;
+};
+
 struct tgsi_full_declaration
 {
    struct tgsi_declaration Declaration;
    struct tgsi_declaration_range Range;
    struct tgsi_declaration_dimension Dim;
    struct tgsi_declaration_semantic Semantic;
+   struct tgsi_immediate_array_data ImmediateData;
 };
 
 struct tgsi_full_immediate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index ce0a92f..97148db 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -90,9 +90,18 @@
 scan_register_dst(scan_register *reg,
                   struct tgsi_full_dst_register *dst)
 {
-   fill_scan_register1d(reg,
-                        dst->Register.File,
-                        dst->Register.Index);
+   if (dst->Register.Dimension) {
+      /*FIXME: right now we don't support indirect
+       * multidimensional addressing */
+      fill_scan_register2d(reg,
+                           dst->Register.File,
+                           dst->Register.Index,
+                           dst->Dimension.Index);
+   } else {
+      fill_scan_register1d(reg,
+                           dst->Register.File,
+                           dst->Register.Index);
+   }
 }
 
 static void
@@ -102,7 +111,6 @@
    if (src->Register.Dimension) {
       /*FIXME: right now we don't support indirect
        * multidimensional addressing */
-      debug_assert(!src->Dimension.Indirect);
       fill_scan_register2d(reg,
                            src->Register.File,
                            src->Register.Index,
@@ -236,7 +244,9 @@
    "ADDR",
    "IMM",
    "PRED",
-   "SV"
+   "SV",
+   "IMMX",
+   "TEMPX"
 };
 
 static boolean
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 527b7d7..55fccba 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -280,7 +280,9 @@
    "ADDR",
    "IMM",
    "PRED",
-   "SV"
+   "SV",
+   "IMMX",
+   "TEMPX"
 };
 
 static boolean
@@ -345,12 +347,68 @@
    return TRUE;
 }
 
-static boolean
-parse_register_dst( struct translate_ctx *ctx,
-                    uint *file,
-                    int *index );
 
-struct parsed_src_bracket {
+/* <register_file_bracket> ::= <file> `['
+ */
+static boolean
+parse_register_file_bracket(
+   struct translate_ctx *ctx,
+   uint *file )
+{
+   if (!parse_file( &ctx->cur, file )) {
+      report_error( ctx, "Unknown register file" );
+      return FALSE;
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '[') {
+      report_error( ctx, "Expected `['" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
+ */
+static boolean
+parse_register_file_bracket_index(
+   struct translate_ctx *ctx,
+   uint *file,
+   int *index )
+{
+   uint uindex;
+
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (!parse_uint( &ctx->cur, &uindex )) {
+      report_error( ctx, "Expected literal unsigned integer" );
+      return FALSE;
+   }
+   *index = (int) uindex;
+   return TRUE;
+}
+
+/* Parse simple 1d register operand.
+ *    <register_1d> ::= <register_file_bracket_index> `]'
+ */
+static boolean
+parse_register_1d(struct translate_ctx *ctx,
+                  uint *file,
+                  int *index )
+{
+   if (!parse_register_file_bracket_index( ctx, file, index ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != ']') {
+      report_error( ctx, "Expected `]'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+struct parsed_bracket {
    int index;
 
    uint ind_file;
@@ -360,21 +418,21 @@
 
 
 static boolean
-parse_register_src_bracket(
+parse_register_bracket(
    struct translate_ctx *ctx,
-   struct parsed_src_bracket *brackets)
+   struct parsed_bracket *brackets)
 {
    const char *cur;
    uint uindex;
 
-   memset(brackets, 0, sizeof(struct parsed_src_bracket));
+   memset(brackets, 0, sizeof(struct parsed_bracket));
 
    eat_opt_white( &ctx->cur );
 
    cur = ctx->cur;
    if (parse_file( &cur, &brackets->ind_file )) {
-      if (!parse_register_dst( ctx, &brackets->ind_file,
-                               &brackets->ind_index ))
+      if (!parse_register_1d( ctx, &brackets->ind_file,
+                              &brackets->ind_index ))
          return FALSE;
       eat_opt_white( &ctx->cur );
 
@@ -443,7 +501,7 @@
 static boolean
 parse_opt_register_src_bracket(
    struct translate_ctx *ctx,
-   struct parsed_src_bracket *brackets,
+   struct parsed_bracket *brackets,
    int *parsed_brackets)
 {
    const char *cur = ctx->cur;
@@ -455,7 +513,7 @@
       ++cur;
       ctx->cur = cur;
 
-      if (!parse_register_src_bracket(ctx, brackets))
+      if (!parse_register_bracket(ctx, brackets))
          return FALSE;
 
       *parsed_brackets = 1;
@@ -464,46 +522,6 @@
    return TRUE;
 }
 
-/* <register_file_bracket> ::= <file> `['
- */
-static boolean
-parse_register_file_bracket(
-   struct translate_ctx *ctx,
-   uint *file )
-{
-   if (!parse_file( &ctx->cur, file )) {
-      report_error( ctx, "Unknown register file" );
-      return FALSE;
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '[') {
-      report_error( ctx, "Expected `['" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
-
-/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
- */
-static boolean
-parse_register_file_bracket_index(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index )
-{
-   uint uindex;
-
-   if (!parse_register_file_bracket( ctx, file ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (!parse_uint( &ctx->cur, &uindex )) {
-      report_error( ctx, "Expected literal unsigned integer" );
-      return FALSE;
-   }
-   *index = (int) uindex;
-   return TRUE;
-}
 
 /* Parse source register operand.
  *    <register_src> ::= <register_file_bracket_index> `]' |
@@ -515,13 +533,12 @@
 parse_register_src(
    struct translate_ctx *ctx,
    uint *file,
-   struct parsed_src_bracket *brackets)
+   struct parsed_bracket *brackets)
 {
-
    brackets->ind_comp = TGSI_SWIZZLE_X;
    if (!parse_register_file_bracket( ctx, file ))
       return FALSE;
-   if (!parse_register_src_bracket( ctx, brackets ))
+   if (!parse_register_bracket( ctx, brackets ))
        return FALSE;
 
    return TRUE;
@@ -629,23 +646,19 @@
 }
 
 
-/* Parse destination register operand.
- *    <register_dst> ::= <register_file_bracket_index> `]'
- */
+/* Parse destination register operand.*/
 static boolean
 parse_register_dst(
    struct translate_ctx *ctx,
    uint *file,
-   int *index )
+   struct parsed_bracket *brackets)
 {
-   if (!parse_register_file_bracket_index( ctx, file, index ))
+   brackets->ind_comp = TGSI_SWIZZLE_X;
+   if (!parse_register_file_bracket( ctx, file ))
       return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != ']') {
-      report_error( ctx, "Expected `]'" );
-      return FALSE;
-   }
-   ctx->cur++;
+   if (!parse_register_bracket( ctx, brackets ))
+       return FALSE;
+
    return TRUE;
 }
 
@@ -655,11 +668,14 @@
    struct tgsi_full_dst_register *dst )
 {
    uint file;
-   int index;
    uint writemask;
    const char *cur;
+   struct parsed_bracket bracket[2];
+   int parsed_opt_brackets;
 
-   if (!parse_register_dst( ctx, &file, &index ))
+   if (!parse_register_dst( ctx, &file, &bracket[0] ))
+      return FALSE;
+   if (!parse_opt_register_src_bracket(ctx, &bracket[1], &parsed_opt_brackets))
       return FALSE;
 
    cur = ctx->cur;
@@ -669,8 +685,24 @@
       return FALSE;
 
    dst->Register.File = file;
-   dst->Register.Index = index;
+   if (parsed_opt_brackets) {
+      dst->Register.Dimension = 1;
+      dst->Dimension.Indirect = 0;
+      dst->Dimension.Dimension = 0;
+      dst->Dimension.Index = bracket[0].index;
+      bracket[0] = bracket[1];
+   }
+   dst->Register.Index = bracket[0].index;
    dst->Register.WriteMask = writemask;
+   if (bracket[0].ind_file != TGSI_FILE_NULL) {
+      dst->Register.Indirect = 1;
+      dst->Indirect.File = bracket[0].ind_file;
+      dst->Indirect.Index = bracket[0].ind_index;
+      dst->Indirect.SwizzleX = bracket[0].ind_comp;
+      dst->Indirect.SwizzleY = bracket[0].ind_comp;
+      dst->Indirect.SwizzleZ = bracket[0].ind_comp;
+      dst->Indirect.SwizzleW = bracket[0].ind_comp;
+   }
    return TRUE;
 }
 
@@ -719,7 +751,7 @@
    uint file;
    uint swizzle[4];
    boolean parsed_swizzle;
-   struct parsed_src_bracket bracket[2];
+   struct parsed_bracket bracket[2];
    int parsed_opt_brackets;
 
    if (*ctx->cur == '-') {
@@ -835,7 +867,7 @@
          inst.Predicate.Negate = 1;
       }
 
-      if (!parse_register_dst( ctx, &file, &index ))
+      if (!parse_register_1d( ctx, &file, &index ))
          return FALSE;
 
       if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle )) {
@@ -985,6 +1017,45 @@
    "PERSPECTIVE"
 };
 
+
+/* Parses a 4-tuple of the form {x, y, z, w}, where x, y, z, w are
+ * floating-point literals; the four values are stored in values[0..3]. */
+static boolean parse_immediate_data(struct translate_ctx *ctx,
+                                    float *values)
+{
+   unsigned i;
+
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '{') {
+      report_error( ctx, "Expected `{'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   for (i = 0; i < 4; i++) {
+      eat_opt_white( &ctx->cur );
+      if (i > 0) {
+         if (*ctx->cur != ',') {
+            report_error( ctx, "Expected `,'" );
+            return FALSE;
+         }
+         ctx->cur++;
+         eat_opt_white( &ctx->cur );
+      }
+      if (!parse_float( &ctx->cur, &values[i] )) {
+         report_error( ctx, "Expected literal floating point" );
+         return FALSE;
+      }
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '}') {
+      report_error( ctx, "Expected `}'" );
+      return FALSE;
+   }
+   ctx->cur++;
+
+   return TRUE;
+}
+
 static boolean parse_declaration( struct translate_ctx *ctx )
 {
    struct tgsi_full_declaration decl;
@@ -995,6 +1066,7 @@
    const char *cur;
    uint advance;
    boolean is_vs_input;
+   boolean is_imm_array;
 
    assert(Elements(semantic_names) == TGSI_SEMANTIC_COUNT);
    assert(Elements(interpolate_names) == TGSI_INTERPOLATE_COUNT);
@@ -1023,8 +1095,9 @@
       decl.Dim.Index2D = brackets[0].first;
    }
 
-   is_vs_input = (file == TGSI_FILE_INPUT && 
+   is_vs_input = (file == TGSI_FILE_INPUT &&
                   ctx->processor == TGSI_PROCESSOR_VERTEX);
+   is_imm_array = (file == TGSI_FILE_IMMEDIATE_ARRAY);
 
    cur = ctx->cur;
    eat_opt_white( &cur );
@@ -1067,6 +1140,44 @@
             break;
          }
       }
+   } else if (is_imm_array) {
+      unsigned i;
+      float *vals_itr;
+      /* we have our immediate data */
+      if (*cur != '{') {
+         report_error( ctx, "Immediate array without data" );
+         return FALSE;
+      }
+      ++cur;
+      ctx->cur = cur;
+
+      decl.ImmediateData.u =
+         MALLOC(sizeof(union tgsi_immediate_data) * 4 *
+                (decl.Range.Last + 1));
+      vals_itr = (float*)decl.ImmediateData.u;
+      for (i = 0; i <= decl.Range.Last; ++i) {
+         if (!parse_immediate_data(ctx, vals_itr)) {
+            FREE(decl.ImmediateData.u);
+            return FALSE;
+         }
+         vals_itr += 4;
+         eat_opt_white( &ctx->cur );
+         if (*ctx->cur != ',') {
+            if (i !=  decl.Range.Last) {
+               report_error( ctx, "Not enough data in immediate array!" );
+               FREE(decl.ImmediateData.u);
+               return FALSE;
+            }
+         } else
+            ++ctx->cur;
+      }
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != '}') {
+         FREE(decl.ImmediateData.u);
+         report_error( ctx, "Immediate array data missing closing '}'" );
+         return FALSE;
+      }
+      ++ctx->cur;
    }
 
    cur = ctx->cur;
@@ -1097,6 +1208,10 @@
       ctx->tokens_cur,
       ctx->header,
       (uint) (ctx->tokens_end - ctx->tokens_cur) );
+
+   if (is_imm_array)
+      FREE(decl.ImmediateData.u);
+
    if (advance == 0)
       return FALSE;
    ctx->tokens_cur += advance;
@@ -1107,7 +1222,6 @@
 static boolean parse_immediate( struct translate_ctx *ctx )
 {
    struct tgsi_full_immediate imm;
-   uint i;
    float values[4];
    uint advance;
 
@@ -1115,37 +1229,13 @@
       report_error( ctx, "Syntax error" );
       return FALSE;
    }
-   if (!str_match_no_case( &ctx->cur, "FLT32" ) || is_digit_alpha_underscore( ctx->cur )) {
+   if (!str_match_no_case( &ctx->cur, "FLT32" ) ||
+       is_digit_alpha_underscore( ctx->cur )) {
       report_error( ctx, "Expected `FLT32'" );
       return FALSE;
    }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '{') {
-      report_error( ctx, "Expected `{'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   for (i = 0; i < 4; i++) {
-      eat_opt_white( &ctx->cur );
-      if (i > 0) {
-         if (*ctx->cur != ',') {
-            report_error( ctx, "Expected `,'" );
-            return FALSE;
-         }
-         ctx->cur++;
-         eat_opt_white( &ctx->cur );
-      }
-      if (!parse_float( &ctx->cur, &values[i] )) {
-         report_error( ctx, "Expected literal floating point" );
-         return FALSE;
-      }
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '}') {
-      report_error( ctx, "Expected `}'" );
-      return FALSE;
-   }
-   ctx->cur++;
+   if (!parse_immediate_data(ctx, values))
+      return FALSE;   /* error already reported by parse_immediate_data */
 
    imm = tgsi_default_full_immediate();
    imm.Immediate.NrTokens += 4;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 7d357e1..3cf6893 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -732,11 +732,12 @@
 }
 
 
-void 
+void
 ureg_emit_src( struct ureg_program *ureg,
                struct ureg_src src )
 {
-   unsigned size = 1 + (src.Indirect ? 1 : 0) + (src.Dimension ? 1 : 0);
+   unsigned size = 1 + (src.Indirect ? 1 : 0) +
+                   (src.Dimension ? (src.DimIndirect ? 2 : 1) : 0);
 
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_INSN, size );
    unsigned n = 0;
@@ -769,11 +770,27 @@
    }
 
    if (src.Dimension) {
-      out[0].src.Dimension = 1;
-      out[n].dim.Indirect = 0;
-      out[n].dim.Dimension = 0;
-      out[n].dim.Padding = 0;
-      out[n].dim.Index = src.DimensionIndex;
+      if (src.DimIndirect) {
+         out[0].src.Dimension = 1;
+         out[n].dim.Indirect = 1;
+         out[n].dim.Dimension = 0;
+         out[n].dim.Padding = 0;
+         out[n].dim.Index = src.DimensionIndex;
+         n++;
+         out[n].value = 0;
+         out[n].src.File = src.DimIndFile;
+         out[n].src.SwizzleX = src.DimIndSwizzle;
+         out[n].src.SwizzleY = src.DimIndSwizzle;
+         out[n].src.SwizzleZ = src.DimIndSwizzle;
+         out[n].src.SwizzleW = src.DimIndSwizzle;
+         out[n].src.Index = src.DimIndIndex;
+      } else {
+         out[0].src.Dimension = 1;
+         out[n].dim.Indirect = 0;
+         out[n].dim.Dimension = 0;
+         out[n].dim.Padding = 0;
+         out[n].dim.Index = src.DimensionIndex;
+      }
       n++;
    }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 055545f..07fb01a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -49,14 +49,18 @@
    unsigned SwizzleZ    : 2;  /* TGSI_SWIZZLE_ */
    unsigned SwizzleW    : 2;  /* TGSI_SWIZZLE_ */
    unsigned Indirect    : 1;  /* BOOL */
+   unsigned DimIndirect : 1;  /* BOOL */
    unsigned Dimension   : 1;  /* BOOL */
    unsigned Absolute    : 1;  /* BOOL */
    unsigned Negate      : 1;  /* BOOL */
    int      Index       : 16; /* SINT */
-   unsigned IndirectFile    : 4;  /* TGSI_FILE_ */
-   int      IndirectIndex   : 16; /* SINT */
-   unsigned IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
-   int      DimensionIndex  : 16; /* SINT */
+   unsigned IndirectFile     : 4;  /* TGSI_FILE_ */
+   int      IndirectIndex    : 16; /* SINT */
+   unsigned IndirectSwizzle  : 2;  /* TGSI_SWIZZLE_ */
+   int      DimensionIndex   : 16; /* SINT */
+   unsigned DimIndFile       : 4;  /* TGSI_FILE_ */
+   int      DimIndIndex      : 16; /* SINT */
+   unsigned DimIndSwizzle    : 2;  /* TGSI_SWIZZLE_ */
 };
 
 /* Very similar to a tgsi_dst_register, removing unsupported fields
@@ -821,15 +825,31 @@
    return reg;
 }
 
-static INLINE struct ureg_src 
+static INLINE struct ureg_src
 ureg_src_dimension( struct ureg_src reg, int index )
 {
    assert(reg.File != TGSI_FILE_NULL);
    reg.Dimension = 1;
+   reg.DimIndirect = 0;
    reg.DimensionIndex = index;
    return reg;
 }
 
+
+static INLINE struct ureg_src
+ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
+                             int index )
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Dimension = 1;
+   reg.DimIndirect = 1;                 /* mark the dimension index as indirect */
+   reg.DimensionIndex = index;
+   reg.DimIndFile = addr.File;          /* the indirect part is taken from addr */
+   reg.DimIndIndex = addr.Index;
+   reg.DimIndSwizzle = addr.SwizzleX;   /* only addr's X swizzle is recorded */
+   return reg;
+}
+
 static INLINE struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
@@ -874,6 +894,10 @@
    src.Negate = 0;
    src.Dimension = 0;
    src.DimensionIndex = 0;
+   src.DimIndirect = 0;
+   src.DimIndFile = TGSI_FILE_NULL;
+   src.DimIndIndex = 0;
+   src.DimIndSwizzle = 0;
 
    return src;
 }
@@ -897,6 +921,10 @@
    src.Negate    = 0;
    src.Dimension = 0;
    src.DimensionIndex = 0;
+   src.DimIndirect = 0;
+   src.DimIndFile = TGSI_FILE_NULL;
+   src.DimIndIndex = 0;
+   src.DimIndSwizzle = 0;
 
    return src;
 }
@@ -944,7 +972,11 @@
    src.Negate    = 0;
    src.Dimension = 0;
    src.DimensionIndex = 0;
-   
+   src.DimIndirect = 0;
+   src.DimIndFile = TGSI_FILE_NULL;
+   src.DimIndIndex = 0;
+   src.DimIndSwizzle = 0;
+
    return src;
 }
 
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index f8dbd2b..0e43a51 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -378,25 +378,28 @@
 	 char *dst = (vert + 
 		      tg->attrib[attr].output_offset);
 
-         if (tg->attrib[attr].instance_divisor) {
-            index = instance_id / tg->attrib[attr].instance_divisor;
+         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+            if (tg->attrib[attr].instance_divisor) {
+               index = instance_id / tg->attrib[attr].instance_divisor;
+            } else {
+               index = elt;
+            }
+
+            index = MIN2(index, tg->attrib[attr].max_index);
+
+            src = tg->attrib[attr].input_ptr +
+                  tg->attrib[attr].input_stride * index;
+
+            tg->attrib[attr].fetch( data, src, 0, 0 );
+
          } else {
-            index = elt;
+            data[0] = (float)instance_id;
          }
-
-         index = MIN2(index, tg->attrib[attr].max_index);
-
-         src = tg->attrib[attr].input_ptr +
-               tg->attrib[attr].input_stride * index;
-
-	 tg->attrib[attr].fetch( data, src, 0, 0 );
-
          if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
                              i, elt, attr, data[0], data[1], data[2], data[3]);
 
 	 tg->attrib[attr].emit( data, dst );
       }
-      
       vert += tg->translate.key.output_stride;
    }
 }
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 954f570..5e373ff 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -195,7 +195,7 @@
          namealign = MAX2(namealign, strlen(flags->name));
       for (flags = orig; flags->name; ++flags)
          debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
-                      sizeof(unsigned long)*CHAR_BIT/4, flags->value,
+                      (int)sizeof(unsigned long)*CHAR_BIT/4, flags->value,
                       flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
    else {
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 6370e77..fe19466 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -567,12 +567,26 @@
 #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C )
 
 
+/**
+ * Align a value; only works for power-of-two (pot) alignments.
+ */
 static INLINE int
 align(int value, int alignment)
 {
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
+/**
+ * Round value up to a multiple of alignment; works for non-power-of-two alignments (alignment must be non-zero).
+ */
+static INLINE size_t
+util_align_npot(size_t value, size_t alignment)
+{
+   if (value % alignment)
+      return value + (alignment - (value % alignment));
+   return value;
+}
+
 static INLINE unsigned
 u_minify(unsigned value, unsigned levels)
 {
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
index ae6f43b..cce0c74 100644
--- a/src/gallium/auxiliary/util/u_pointer.h
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -111,6 +111,17 @@
    return pf.f;
 }
 
+static INLINE void *
+func_to_pointer( func_pointer f )
+{
+   union {
+      void *p;
+      func_pointer f;
+   } pf;
+   pf.f = f;      /* store through the function-pointer member... */
+   return pf.p;   /* ...and read it back through the void* member */
+}
+
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 75d4443..af229e6 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -59,6 +59,8 @@
                                       unsigned usage )
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
+   if (!upload)
+      return NULL;
 
    upload->pipe = pipe;
    upload->default_size = default_size;
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 0242ded..4e35a4c 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -1,3 +1,5 @@
+.. _context:
+
 Context
 =======
 
@@ -120,7 +122,7 @@
 color value. While it is only possible to clear one surface at a time (which can
 include several layers), this surface need not be bound to the framebuffer.
 
-``clear_depth_stencil``clears a single depth, stencil or depth/stencil surface
+``clear_depth_stencil`` clears a single depth, stencil or depth/stencil surface
 with the specified depth and stencil values (for combined depth/stencil buffers,
 is is also possible to only clear one or the other part). While it is only
 possible to clear one surface at a time (which can include several layers),
diff --git a/src/gallium/docs/source/distro.rst b/src/gallium/docs/source/distro.rst
index 100afe3..6ba5a05 100644
--- a/src/gallium/docs/source/distro.rst
+++ b/src/gallium/docs/source/distro.rst
@@ -10,69 +10,100 @@
 Cell
 ^^^^
 
+Simple driver for the IBM Cell architecture. Runs faster than :ref:`softpipe`
+on Cell-based machines.
+
 Failover
 ^^^^^^^^
 
-Deprecated.
+Broken and deprecated.
 
 Intel i915
 ^^^^^^^^^^
 
+Driver for Intel i915 and i945 chipsets.
+
 Intel i965
 ^^^^^^^^^^
 
-Highly experimental.
+Highly experimental driver for Intel i965 chipsets.
 
 Identity
 ^^^^^^^^
 
-Wrapper driver.
+Wrapper driver. The identity driver is a simple skeleton that passes through
+all of its :ref:`Context` and :ref:`Screen` methods to an underlying Context
+and Screen, and as such, it is an excellent starting point for new drivers.
 
 LLVM Softpipe
 ^^^^^^^^^^^^^
 
-nVidia nv30
+A version of :ref:`softpipe` that uses the Low-Level Virtual Machine to
+dynamically generate optimized rasterizing pipelines.
+
+nVidia nvfx
 ^^^^^^^^^^^
 
-nVidia nv40
-^^^^^^^^^^^
+Driver for the nVidia nv30 and nv40 families of GPUs.
 
 nVidia nv50
 ^^^^^^^^^^^
 
+Driver for the nVidia nv50 family of GPUs.
+
 VMWare SVGA
 ^^^^^^^^^^^
 
+Driver for VMWare virtualized guest operating system graphics processing.
+
 ATI r300
 ^^^^^^^^
 
-Testing-quality.
+Driver for the ATI/AMD r300, r400, and r500 families of GPUs.
+
+.. _softpipe:
 
 Softpipe
 ^^^^^^^^
 
-Reference software rasterizer.
+Reference software rasterizer. Slow but accurate.
 
 Trace
 ^^^^^
 
-Wrapper driver.
+Wrapper driver. Trace dumps an XML record of the calls made to the
+:ref:`Context` and :ref:`Screen` objects that it wraps.
 
 State Trackers
 --------------
 
+.. _dri:
+
 Direct Rendering Infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+Tracker that implements the client-side DRI protocol, for providing direct
+acceleration services to X11 servers with the DRI extension. Supports DRI1
+and DRI2. Only GL is supported.
+
+.. _egl:
+
 EGL
 ^^^
 
+Tracker for the Khronos EGL standard, used to set up GL and GLES contexts
+without extra knowledge of the underlying windowing system.
+
 GLX
 ^^^
 
 MesaGL
 ^^^^^^
 
+Tracker implementing a GL state machine. Not usable as a standalone tracker;
+Mesa should be built with another state tracker, such as :ref:`DRI` or
+:ref:`EGL`.
+
 Python
 ^^^^^^
 
@@ -82,9 +113,12 @@
 WGL
 ^^^
 
-Xorg XFree86 DDX
+Xorg/XFree86 DDX
 ^^^^^^^^^^^^^^^^
 
+Tracker for XFree86 and Xorg X11 servers. Provides device-dependent
+modesetting and acceleration as a DDX driver.
+
 Auxiliary
 ---------
 
diff --git a/src/gallium/docs/source/glossary.rst b/src/gallium/docs/source/glossary.rst
index 0696cb5..acde56e 100644
--- a/src/gallium/docs/source/glossary.rst
+++ b/src/gallium/docs/source/glossary.rst
@@ -21,3 +21,7 @@
    LOD
       Level of Detail. Also spelled "LoD." The value that determines when the
       switches between mipmaps occur during texture sampling.
+
+   GLSL
+      GL Shading Language. The official, common high-level shader language used
+      in GL 2.0 and above.
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 48d9d57..e3ef49c 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -36,7 +36,9 @@
   bound.
 * ``OCCLUSION_QUERY``: Whether occlusion queries are available.
 * ``TIMER_QUERY``: Whether timer queries are available.
-* ``TEXTURE_SHADOW_MAP``: XXX
+* ``TEXTURE_SHADOW_MAP``: indicates whether the fragment shader hardware
+  can do the depth texture / Z comparison operation in TEX instructions
+  for shadow testing.
 * ``MAX_TEXTURE_2D_LEVELS``: The maximum number of mipmap levels available
   for a 2D texture.
 * ``MAX_TEXTURE_3D_LEVELS``: The maximum number of mipmap levels available
@@ -55,7 +57,13 @@
   from color blend equations, in :ref:`Blend` state.
 * ``SM3``: Whether the vertex shader and fragment shader support equivalent
   opcodes to the Shader Model 3 specification. XXX oh god this is horrible
-* ``MAX_PREDICATE_REGISTERS``: XXX
+* ``MAX_PREDICATE_REGISTERS``: indicates the number of predicate registers
+  available.  Predicate registers may be set as a side-effect of ALU
+  instructions to indicate less than, greater than or equal to zero.
+  Later instructions can use a predicate register to control writing to
+  each channel of destination registers.  NOTE: predicate registers have
+  not been fully implemented in Gallium at this time.  See the
+  GL_NV_fragment_program extension for more info (look for "condition codes").
 * ``MAX_COMBINED_SAMPLERS``: The total number of samplers accessible from
   the vertex and fragment shader, inclusive.
 * ``MAX_CONST_BUFFERS``: Maximum number of constant buffers that can be bound
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 411dce8..205e7b8 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -26,9 +26,11 @@
 Instruction Set
 ---------------
 
-From GL_NV_vertex_program
+Core ISA
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
+These opcodes are guaranteed to be available regardless of the driver being
+used.
 
 .. opcode:: ARL - Address Register Load
 
@@ -637,10 +639,6 @@
    Considered for removal.
 
 
-From GL_NV_vertex_program2
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-
 .. opcode:: ARA - Address Register Add
 
   TBD
@@ -827,11 +825,14 @@
    Considered for removal.
 
 
-From GL_NV_gpu_program4
+Compute ISA
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
+These opcodes are primarily provided for special-use computational shaders.
 Support for these opcodes indicated by a special pipe capability bit (TBD).
 
+XXX so let's discuss it, yeah?
+
 .. opcode:: CEIL - Ceiling
 
 .. math::
@@ -989,10 +990,17 @@
 
   TBD
 
+.. note::
 
-From GL_NV_geometry_program4
+   Support for CONT is determined by a special capability bit,
+   ``TGSI_CONT_SUPPORTED``. See :ref:`Screen` for more information.
+
+
+Geometry ISA
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+These opcodes are only supported in geometry shaders; they have no meaning
+in any other type of shader.
 
 .. opcode:: EMIT - Emit
 
@@ -1004,9 +1012,11 @@
   TBD
 
 
-From GLSL
+GLSL ISA
 ^^^^^^^^^^
 
+These opcodes are part of :term:`GLSL`'s opcode set. Support for these
+opcodes is determined by a special capability bit, ``GLSL``.
 
 .. opcode:: BGNLOOP - Begin a Loop
 
@@ -1045,6 +1055,7 @@
 ps_2_x
 ^^^^^^^^^^^^
 
+XXX wait what
 
 .. opcode:: CALLNZ - Subroutine Call If Not Zero
 
@@ -1062,10 +1073,15 @@
 
 .. _doubleopcodes:
 
-Double Opcodes
+Double ISA
 ^^^^^^^^^^^^^^^
 
-.. opcode:: DADD - Add Double
+The double-precision opcodes reinterpret four-component vectors into
+two-component vectors with doubled precision in each component.
+
+Support for these opcodes is XXX undecided. :T
+
+.. opcode:: DADD - Add
 
 .. math::
 
@@ -1074,7 +1090,7 @@
   dst.zw = src0.zw + src1.zw
 
 
-.. opcode:: DDIV - Divide Double
+.. opcode:: DDIV - Divide
 
 .. math::
 
@@ -1082,7 +1098,7 @@
 
   dst.zw = src0.zw / src1.zw
 
-.. opcode:: DSEQ - Set Double on Equal
+.. opcode:: DSEQ - Set on Equal
 
 .. math::
 
@@ -1090,7 +1106,7 @@
 
   dst.zw = src0.zw == src1.zw ? 1.0F : 0.0F
 
-.. opcode:: DSLT - Set Double on Less than
+.. opcode:: DSLT - Set on Less than
 
 .. math::
 
@@ -1098,7 +1114,7 @@
 
   dst.zw = src0.zw < src1.zw ? 1.0F : 0.0F
 
-.. opcode:: DFRAC - Double Fraction
+.. opcode:: DFRAC - Fraction
 
 .. math::
 
@@ -1107,23 +1123,33 @@
   dst.zw = src.zw - \lfloor src.zw\rfloor
 
 
-.. opcode:: DFRACEXP - Convert Double Number to Fractional and Integral Components
+.. opcode:: DFRACEXP - Convert Number to Fractional and Integral Components
+
+Like the ``frexp()`` routine in many math libraries, this opcode stores the
+exponent of its source to ``dst0``, and the significand to ``dst1``, such that
+:math:`dst1 \times 2^{dst0} = src` .
 
 .. math::
 
-  dst0.xy = frexp(src.xy, dst1.xy)
+  dst0.xy = exp(src.xy)
 
-  dst0.zw = frexp(src.zw, dst1.zw)
+  dst1.xy = frac(src.xy)
 
-.. opcode:: DLDEXP - Multiple Double Number by Integral Power of 2
+  dst0.zw = exp(src.zw)
+
+  dst1.zw = frac(src.zw)
+
+.. opcode:: DLDEXP - Multiply Number by Integral Power of 2
+
+This opcode is the inverse of :opcode:`DFRACEXP`.
 
 .. math::
 
-  dst.xy = ldexp(src0.xy, src1.xy)
+  dst.xy = src0.xy \times 2^{src1.xy}
 
-  dst.zw = ldexp(src0.zw, src1.zw)
+  dst.zw = src0.zw \times 2^{src1.zw}
 
-.. opcode:: DMIN - Minimum Double
+.. opcode:: DMIN - Minimum
 
 .. math::
 
@@ -1131,7 +1157,7 @@
 
   dst.zw = min(src0.zw, src1.zw)
 
-.. opcode:: DMAX - Maximum Double
+.. opcode:: DMAX - Maximum
 
 .. math::
 
@@ -1139,7 +1165,7 @@
 
   dst.zw = max(src0.zw, src1.zw)
 
-.. opcode:: DMUL - Multiply Double
+.. opcode:: DMUL - Multiply
 
 .. math::
 
@@ -1148,7 +1174,7 @@
   dst.zw = src0.zw \times src1.zw
 
 
-.. opcode:: DMAD - Multiply And Add Doubles
+.. opcode:: DMAD - Multiply And Add
 
 .. math::
 
@@ -1157,7 +1183,7 @@
   dst.zw = src0.zw \times src1.zw + src2.zw
 
 
-.. opcode:: DRCP - Reciprocal Double
+.. opcode:: DRCP - Reciprocal
 
 .. math::
 
@@ -1165,7 +1191,7 @@
 
    dst.zw = \frac{1}{src.zw}
 
-.. opcode:: DSQRT - Square root double
+.. opcode:: DSQRT - Square Root
 
 .. math::
 
@@ -1280,38 +1306,46 @@
 TGSI_SEMANTIC_POSITION
 """"""""""""""""""""""
 
-Position, sometimes known as HPOS or WPOS for historical reasons, is the
-location of the vertex in space, in ``(x, y, z, w)`` format. ``x``, ``y``, and ``z``
-are the Cartesian coordinates, and ``w`` is the homogenous coordinate and used
-for the perspective divide, if enabled.
+For vertex shaders, TGSI_SEMANTIC_POSITION indicates the vertex shader
+output register which contains the homogeneous vertex position in the clip
+space coordinate system.  After clipping, the X, Y and Z components of the
+vertex will be divided by the W value to get normalized device coordinates.
 
-As a vertex shader output, position should be scaled to the viewport. When
-used in fragment shaders, position will be in window coordinates. The convention
-used depends on the FS_COORD_ORIGIN and FS_COORD_PIXEL_CENTER properties.
+For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
+fragment shader input contains the fragment's window position.  The X
+component starts at zero and always increases from left to right.
+The Y component starts at zero and always increases but Y=0 may either
+indicate the top of the window or the bottom depending on the fragment
+coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
+The Z coordinate ranges from 0 to 1 to represent depth from the front
+to the back of the Z buffer.  The W component contains the reciprocal
+of the interpolated vertex position W component.
 
-XXX additionally, is there a way to configure the perspective divide? it's
-accelerated on most chipsets AFAIK...
 
-Position, if not specified, usually defaults to ``(0, 0, 0, 1)``, and can
-be partially specified as ``(x, y, 0, 1)`` or ``(x, y, z, 1)``.
-
-XXX usually? can we solidify that?
 
 TGSI_SEMANTIC_COLOR
 """""""""""""""""""
 
-Colors are used to, well, color the primitives. Colors are always in
-``(r, g, b, a)`` format.
+For vertex shader outputs or fragment shader inputs/outputs, this
+label indicates that the register contains an R,G,B,A color.
 
-If alpha is not specified, it defaults to 1.
+Several shader inputs/outputs may contain colors so the semantic index
+is used to distinguish them.  For example, color[0] may be the diffuse
+color while color[1] may be the specular color.
+
+This label is needed so that the flat/smooth shading can be applied
+to the right interpolants during rasterization.
+
+
 
 TGSI_SEMANTIC_BCOLOR
 """"""""""""""""""""
 
 Back-facing colors are only used for back-facing polygons, and are only valid
 in vertex shader outputs. After rasterization, all polygons are front-facing
-and COLOR and BCOLOR end up occupying the same slots in the fragment, so
-all BCOLORs effectively become regular COLORs in the fragment shader.
+and COLOR and BCOLOR end up occupying the same slots in the fragment shader,
+so all BCOLORs effectively become regular COLORs in the fragment shader.
+
 
 TGSI_SEMANTIC_FOG
 """""""""""""""""
@@ -1363,7 +1397,15 @@
 TGSI_SEMANTIC_EDGEFLAG
 """"""""""""""""""""""
 
-XXX no clue
+For vertex shaders, this semantic label indicates that an input or
+output is a boolean edge flag.  The register layout is [F, x, x, x]
+where F is 0.0 or 1.0 and x = don't care.  Normally, the vertex shader
+simply copies the edge flag input to the edgeflag output.
+
+Edge flags are used to control which lines or points are actually
+drawn when the polygon mode converts triangles/quads/polygons into
+points or lines.
+
 
 
 Properties
@@ -1420,9 +1462,9 @@
 +--------------------+--------------+--------------------+--------------+
 | Texture Components | Gallium      | OpenGL             | Direct3D 9   |
 +====================+==============+====================+==============+
-| R                  | XXX TBD      | (r, 0, 0, 1)       | (r, 1, 1, 1) |
+| R                  | (r, 0, 0, 1) | (r, 0, 0, 1)       | (r, 1, 1, 1) |
 +--------------------+--------------+--------------------+--------------+
-| RG                 | XXX TBD      | (r, g, 0, 1)       | (r, g, 1, 1) |
+| RG                 | (r, g, 0, 1) | (r, g, 0, 1)       | (r, g, 1, 1) |
 +--------------------+--------------+--------------------+--------------+
 | RGB                | (r, g, b, 1) | (r, g, b, 1)       | (r, g, b, 1) |
 +--------------------+--------------+--------------------+--------------+
diff --git a/src/gallium/drivers/galahad/Makefile b/src/gallium/drivers/galahad/Makefile
new file mode 100644
index 0000000..67d0874
--- /dev/null
+++ b/src/gallium/drivers/galahad/Makefile
@@ -0,0 +1,12 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = galahad
+
+C_SOURCES = \
+	glhd_objects.c \
+	glhd_context.c \
+	glhd_screen.c \
+	glhd_drm.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/galahad/SConscript b/src/gallium/drivers/galahad/SConscript
new file mode 100644
index 0000000..fc668fa
--- /dev/null
+++ b/src/gallium/drivers/galahad/SConscript
@@ -0,0 +1,14 @@
+Import('*')
+
+env = env.Clone()
+
+identity = env.ConvenienceLibrary(
+	target = 'identity',
+	source = [
+		'glhd_context.c',
+		'glhd_drm.c',
+		'glhd_objects.c',
+		'glhd_screen.c',
+	])
+
+Export('identity')
diff --git a/src/gallium/drivers/galahad/glhd_context.c b/src/gallium/drivers/galahad/glhd_context.c
new file mode 100644
index 0000000..3b20cb1
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_context.c
@@ -0,0 +1,990 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_context.h"
+
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+
+#include "glhd_context.h"
+#include "glhd_objects.h"
+
+
+static void
+galahad_destroy(struct pipe_context *_pipe)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Destroy the inner context first, then free the wrapper itself. */
+   inner->destroy(inner);
+   FREE(glhd);
+}
+
+static void
+galahad_draw_arrays(struct pipe_context *_pipe,
+                    unsigned prim,
+                    unsigned start,
+                    unsigned count)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Forwarded verbatim: prim, start and count are not inspected
+    * before the inner context's draw_arrays hook is invoked.
+    */
+   inner->draw_arrays(inner, prim, start, count);
+}
+
+static void
+galahad_draw_elements(struct pipe_context *_pipe,
+                      struct pipe_resource *_indexResource,
+                      unsigned indexSize,
+                      int indexBias,
+                      unsigned prim,
+                      unsigned start,
+                      unsigned count)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_resource *glhd_index = galahad_resource(_indexResource);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_index = glhd_index->resource;
+
+   /* The index buffer arrives as a galahad resource; the inner context
+    * is given the resource stored in it.  Everything else is forwarded.
+    */
+   inner->draw_elements(inner,
+                        inner_index,
+                        indexSize, indexBias,
+                        prim, start, count);
+}
+
+static void
+galahad_draw_range_elements(struct pipe_context *_pipe,
+                            struct pipe_resource *_indexResource,
+                            unsigned indexSize,
+                            int indexBias,
+                            unsigned minIndex,
+                            unsigned maxIndex,
+                            unsigned mode,
+                            unsigned start,
+                            unsigned count)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_resource *glhd_index = galahad_resource(_indexResource);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_index = glhd_index->resource;
+
+   /* Swap the galahad index resource for the one stored inside it and
+    * forward every other argument untouched; no checks are done here.
+    */
+   inner->draw_range_elements(inner,
+                              inner_index,
+                              indexSize, indexBias,
+                              minIndex, maxIndex,
+                              mode,
+                              start, count);
+}
+
+static struct pipe_query *
+galahad_create_query(struct pipe_context *_pipe,
+                     unsigned query_type)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_screen *screen = inner->screen;
+
+   /* Report query types whose cap the screen denies; still forwarded. */
+   if (query_type == PIPE_QUERY_OCCLUSION_COUNTER) {
+      if (!screen->get_param(screen, PIPE_CAP_OCCLUSION_QUERY)) {
+         glhd_error("Occlusion query requested but not supported");
+      }
+   } else if (query_type == PIPE_QUERY_TIME_ELAPSED) {
+      if (!screen->get_param(screen, PIPE_CAP_TIMER_QUERY)) {
+         glhd_error("Timer query requested but not supported");
+      }
+   }
+   return inner->create_query(inner, query_type);
+}
+
+static void
+galahad_destroy_query(struct pipe_context *_pipe,
+                      struct pipe_query *query)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The query handle is handed to the inner context as received. */
+   inner->destroy_query(inner, query);
+}
+
+static void
+galahad_begin_query(struct pipe_context *_pipe,
+                    struct pipe_query *query)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: begin_query is forwarded without any checks. */
+   inner->begin_query(inner, query);
+}
+
+static void
+galahad_end_query(struct pipe_context *_pipe,
+                  struct pipe_query *query)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: end_query is forwarded without any checks. */
+   inner->end_query(inner, query);
+}
+
+static boolean
+galahad_get_query_result(struct pipe_context *_pipe,
+                         struct pipe_query *query,
+                         boolean wait,
+                         void *result)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The inner context's return value is returned untouched; the
+    * result pointer is handed to the inner context unchanged. */
+   return inner->get_query_result(inner, query,
+                                  wait, result);
+}
+
+static void *
+galahad_create_blend_state(struct pipe_context *_pipe,
+                           const struct pipe_blend_state *blend)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The inner context's state object is returned without wrapping. */
+   return inner->create_blend_state(inner, blend);
+}
+
+static void
+galahad_bind_blend_state(struct pipe_context *_pipe,
+                         void *blend)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The opaque state pointer is forwarded exactly as received. */
+   inner->bind_blend_state(inner, blend);
+}
+
+static void
+galahad_delete_blend_state(struct pipe_context *_pipe,
+                           void *blend)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_blend_state(inner, blend);
+}
+
+static void *
+galahad_create_sampler_state(struct pipe_context *_pipe,
+                             const struct pipe_sampler_state *sampler)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The inner context's state object is returned without wrapping. */
+   return inner->create_sampler_state(inner, sampler);
+}
+
+static void
+galahad_bind_fragment_sampler_states(struct pipe_context *_pipe,
+                                     unsigned num_samplers,
+                                     void **samplers)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Count and array are forwarded as-is; neither is validated here. */
+   inner->bind_fragment_sampler_states(inner,
+                                       num_samplers, samplers);
+}
+
+static void
+galahad_bind_vertex_sampler_states(struct pipe_context *_pipe,
+                                   unsigned num_samplers,
+                                   void **samplers)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Count and array are forwarded as-is; neither is validated here. */
+   inner->bind_vertex_sampler_states(inner,
+                                     num_samplers, samplers);
+}
+
+static void
+galahad_delete_sampler_state(struct pipe_context *_pipe,
+                             void *sampler)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_sampler_state(inner, sampler);
+}
+
+static void *
+galahad_create_rasterizer_state(struct pipe_context *_pipe,
+                                const struct pipe_rasterizer_state *rasterizer)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Warn about point settings that conflict with the quad-rasterization
+    * flag; the state is created by the inner context regardless. */
+   if (rasterizer->point_quad_rasterization && rasterizer->point_smooth) {
+      glhd_warn("Point smoothing requested but ignored");
+   }
+   if (!rasterizer->point_quad_rasterization &&
+       rasterizer->sprite_coord_enable) {
+      glhd_warn("Point sprites requested but ignored");
+   }
+
+   return inner->create_rasterizer_state(inner,
+                                         rasterizer);
+}
+
+static void
+galahad_bind_rasterizer_state(struct pipe_context *_pipe,
+                              void *rasterizer)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The opaque state pointer is forwarded exactly as received. */
+   inner->bind_rasterizer_state(inner, rasterizer);
+}
+
+static void
+galahad_delete_rasterizer_state(struct pipe_context *_pipe,
+                                void *rasterizer)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_rasterizer_state(inner, rasterizer);
+}
+
+static void *
+galahad_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                         const struct pipe_depth_stencil_alpha_state *depth_stencil_alpha)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The inner context's state object is returned without wrapping. */
+   return inner->create_depth_stencil_alpha_state(inner, depth_stencil_alpha);
+}
+
+static void
+galahad_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                       void *depth_stencil_alpha)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The opaque state pointer is forwarded exactly as received. */
+   inner->bind_depth_stencil_alpha_state(inner, depth_stencil_alpha);
+}
+
+static void
+galahad_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                         void *depth_stencil_alpha)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_depth_stencil_alpha_state(inner, depth_stencil_alpha);
+}
+
+static void *
+galahad_create_fs_state(struct pipe_context *_pipe,
+                        const struct pipe_shader_state *fs)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The inner context's shader object is returned without wrapping. */
+   return inner->create_fs_state(inner, fs);
+}
+
+static void
+galahad_bind_fs_state(struct pipe_context *_pipe,
+                      void *fs)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The opaque shader pointer is forwarded exactly as received. */
+   inner->bind_fs_state(inner, fs);
+}
+
+static void
+galahad_delete_fs_state(struct pipe_context *_pipe,
+                        void *fs)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_fs_state(inner, fs);
+}
+
+static void *
+galahad_create_vs_state(struct pipe_context *_pipe,
+                        const struct pipe_shader_state *vs)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The inner context's shader object is returned without wrapping. */
+   return inner->create_vs_state(inner, vs);
+}
+
+static void
+galahad_bind_vs_state(struct pipe_context *_pipe,
+                      void *vs)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The opaque shader pointer is forwarded exactly as received. */
+   inner->bind_vs_state(inner, vs);
+}
+
+static void
+galahad_delete_vs_state(struct pipe_context *_pipe,
+                        void *vs)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_vs_state(inner, vs);
+}
+
+
+static void *
+galahad_create_vertex_elements_state(struct pipe_context *_pipe,
+                                     unsigned num_elements,
+                                     const struct pipe_vertex_element *vertex_elements)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Element count and array go to the inner context unvalidated. */
+   return inner->create_vertex_elements_state(inner,
+                                              num_elements, vertex_elements);
+}
+
+static void
+galahad_bind_vertex_elements_state(struct pipe_context *_pipe,
+                                   void *velems)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* The opaque state pointer is forwarded exactly as received. */
+   inner->bind_vertex_elements_state(inner, velems);
+}
+
+static void
+galahad_delete_vertex_elements_state(struct pipe_context *_pipe,
+                                     void *velems)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Deletion is left entirely to the inner context. */
+   inner->delete_vertex_elements_state(inner, velems);
+}
+
+static void
+galahad_set_blend_color(struct pipe_context *_pipe,
+                        const struct pipe_blend_color *blend_color)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the state pointer is forwarded without inspection. */
+   inner->set_blend_color(inner, blend_color);
+}
+
+static void
+galahad_set_stencil_ref(struct pipe_context *_pipe,
+                        const struct pipe_stencil_ref *stencil_ref)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the state pointer is forwarded without inspection. */
+   inner->set_stencil_ref(inner, stencil_ref);
+}
+
+static void
+galahad_set_clip_state(struct pipe_context *_pipe,
+                       const struct pipe_clip_state *clip)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the state pointer is forwarded without inspection. */
+   inner->set_clip_state(inner, clip);
+}
+
+static void
+galahad_set_sample_mask(struct pipe_context *_pipe,
+                        unsigned sample_mask)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the mask value is forwarded without inspection. */
+   inner->set_sample_mask(inner, sample_mask);
+}
+
+static void
+galahad_set_constant_buffer(struct pipe_context *_pipe,
+                            uint shader,
+                            uint index,
+                            struct pipe_resource *_resource)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_buf = NULL;
+
+   /* A NULL buffer is forwarded as NULL; any other buffer is first
+    * run through galahad_resource_unwrap() before being passed on.
+    */
+   if (_resource) {
+      inner_buf = galahad_resource_unwrap(_resource);
+   }
+
+   inner->set_constant_buffer(inner,
+                              shader,
+                              index,
+                              inner_buf);
+}
+
+static void
+galahad_set_framebuffer_state(struct pipe_context *_pipe,
+                              const struct pipe_framebuffer_state *_state)
+{
+   struct galahad_context *glhd_pipe = galahad_context(_pipe);
+   struct pipe_context *pipe = glhd_pipe->pipe;
+   struct pipe_framebuffer_state unwrapped_state;
+   struct pipe_framebuffer_state *state = NULL;
+   unsigned i, nr;
+
+   /* Validate and unwrap only when a state is given; NULL is passed on. */
+   if (_state) {
+      int max_rts = pipe->screen->get_param(pipe->screen, PIPE_CAP_MAX_RENDER_TARGETS);
+      nr = _state->nr_cbufs;
+      if (nr > PIPE_MAX_COLOR_BUFS) {
+         glhd_error("%d render targets bound, but only %d are permitted by API",
+            _state->nr_cbufs, PIPE_MAX_COLOR_BUFS);
+         nr = PIPE_MAX_COLOR_BUFS; /* clamp: keeps cbufs[] accesses in bounds */
+      } else if (nr > max_rts) {
+         glhd_warn("%d render targets bound, but only %d are supported",
+            _state->nr_cbufs, max_rts);
+      }
+      memcpy(&unwrapped_state, _state, sizeof(unwrapped_state));
+      unwrapped_state.nr_cbufs = nr;
+      for (i = 0; i < nr; i++)
+         unwrapped_state.cbufs[i] = galahad_surface_unwrap(_state->cbufs[i]);
+      for (; i < PIPE_MAX_COLOR_BUFS; i++)
+         unwrapped_state.cbufs[i] = NULL;
+      unwrapped_state.zsbuf = galahad_surface_unwrap(_state->zsbuf);
+      state = &unwrapped_state;
+   }
+
+   pipe->set_framebuffer_state(pipe, state);
+}
+
+static void
+galahad_set_polygon_stipple(struct pipe_context *_pipe,
+                            const struct pipe_poly_stipple *poly_stipple)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the state pointer is forwarded without inspection. */
+   inner->set_polygon_stipple(inner, poly_stipple);
+}
+
+static void
+galahad_set_scissor_state(struct pipe_context *_pipe,
+                          const struct pipe_scissor_state *scissor)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the state pointer is forwarded without inspection. */
+   inner->set_scissor_state(inner, scissor);
+}
+
+static void
+galahad_set_viewport_state(struct pipe_context *_pipe,
+                           const struct pipe_viewport_state *viewport)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the state pointer is forwarded without inspection. */
+   inner->set_viewport_state(inner, viewport);
+}
+
+static void
+galahad_set_fragment_sampler_views(struct pipe_context *_pipe,
+                                   unsigned num, struct pipe_sampler_view **_views)
+{
+   struct galahad_context *glhd_pipe = galahad_context(_pipe);
+   struct pipe_context *pipe = glhd_pipe->pipe;
+   struct pipe_sampler_view *unwrapped_views[PIPE_MAX_SAMPLERS];
+   unsigned i;
+
+   if (num > PIPE_MAX_SAMPLERS) {
+      glhd_error("%u sampler views bound, but only %d are permitted by API",
+         num, PIPE_MAX_SAMPLERS);
+      num = PIPE_MAX_SAMPLERS; /* clamp: keeps unwrapped_views[] in bounds */
+   }
+   if (_views) { /* unwrap the given views and NULL-fill the unused slots */
+      for (i = 0; i < num; i++)
+         unwrapped_views[i] = galahad_sampler_view_unwrap(_views[i]);
+      for (; i < PIPE_MAX_SAMPLERS; i++)
+         unwrapped_views[i] = NULL;
+   }
+   pipe->set_fragment_sampler_views(pipe, num, _views ? unwrapped_views : NULL);
+}
+
+static void
+galahad_set_vertex_sampler_views(struct pipe_context *_pipe,
+                                 unsigned num, struct pipe_sampler_view **_views)
+{
+   struct galahad_context *glhd_pipe = galahad_context(_pipe);
+   struct pipe_context *pipe = glhd_pipe->pipe;
+   struct pipe_sampler_view *unwrapped_views[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned i;
+
+   if (num > PIPE_MAX_VERTEX_SAMPLERS) {
+      glhd_error("%u vertex sampler views bound, but only %d are permitted by API",
+         num, PIPE_MAX_VERTEX_SAMPLERS);
+      num = PIPE_MAX_VERTEX_SAMPLERS; /* clamp: keeps unwrapped_views[] in bounds */
+   }
+   if (_views) { /* unwrap the given views and NULL-fill the unused slots */
+      for (i = 0; i < num; i++)
+         unwrapped_views[i] = galahad_sampler_view_unwrap(_views[i]);
+      for (; i < PIPE_MAX_VERTEX_SAMPLERS; i++)
+         unwrapped_views[i] = NULL;
+   }
+   pipe->set_vertex_sampler_views(pipe, num, _views ? unwrapped_views : NULL);
+}
+
+static void
+galahad_set_vertex_buffers(struct pipe_context *_pipe,
+                           unsigned num_buffers,
+                           const struct pipe_vertex_buffer *_buffers)
+{
+   struct galahad_context *glhd_pipe = galahad_context(_pipe);
+   struct pipe_context *pipe = glhd_pipe->pipe;
+   struct pipe_vertex_buffer unwrapped_buffers[PIPE_MAX_SHADER_INPUTS];
+   unsigned i;
+
+   if (num_buffers > PIPE_MAX_SHADER_INPUTS) {
+      glhd_error("%u vertex buffers bound, but only %d can be handled",
+         num_buffers, PIPE_MAX_SHADER_INPUTS);
+      num_buffers = PIPE_MAX_SHADER_INPUTS; /* clamp: keeps the copy in bounds */
+   }
+   if (num_buffers) { /* copy the descriptors, then unwrap each buffer */
+      memcpy(unwrapped_buffers, _buffers, num_buffers * sizeof(*_buffers));
+      for (i = 0; i < num_buffers; i++)
+         unwrapped_buffers[i].buffer = galahad_resource_unwrap(_buffers[i].buffer);
+   }
+   pipe->set_vertex_buffers(pipe, num_buffers, num_buffers ? unwrapped_buffers : NULL);
+}
+static void
+galahad_resource_copy_region(struct pipe_context *_pipe,
+                             struct pipe_resource *_dst,
+                             struct pipe_subresource subdst,
+                             unsigned dstx,
+                             unsigned dsty,
+                             unsigned dstz,
+                             struct pipe_resource *_src,
+                             struct pipe_subresource subsrc,
+                             unsigned srcx,
+                             unsigned srcy,
+                             unsigned srcz,
+                             unsigned width,
+                             unsigned height)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_resource *glhd_dst = galahad_resource(_dst);
+   struct galahad_resource *glhd_src = galahad_resource(_src);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_dst = glhd_dst->resource;
+   struct pipe_resource *inner_src = glhd_src->resource;
+
+   /* Differing formats only produce a warning; the copy is still
+    * forwarded to the inner context afterwards. */
+   if (_src->format != _dst->format) {
+      glhd_warn("Format mismatch: Source is %s, destination is %s",
+         util_format_short_name(_src->format),
+         util_format_short_name(_dst->format));
+   }
+
+   /* Both resources are replaced by the ones stored in their galahad
+    * wrappers; coordinates and sizes are passed on unchanged. */
+   inner->resource_copy_region(inner,
+                               inner_dst,
+                               subdst,
+                               dstx, dsty, dstz,
+                               inner_src,
+                               subsrc,
+                               srcx, srcy, srcz,
+                               width,
+                               height);
+}
+
+static void
+galahad_clear(struct pipe_context *_pipe,
+              unsigned buffers,
+              const float *rgba,
+              double depth,
+              unsigned stencil)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: the buffer mask and the clear values are handed
+    * to the inner context without being inspected. */
+   inner->clear(inner,
+                buffers, rgba,
+                depth, stencil);
+}
+
+static void
+galahad_clear_render_target(struct pipe_context *_pipe,
+                            struct pipe_surface *_dst,
+                            const float *rgba,
+                            unsigned dstx, unsigned dsty,
+                            unsigned width, unsigned height)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_surface *glhd_dst = galahad_surface(_dst);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_surface *inner_dst = glhd_dst->surface;
+
+   /* The destination is replaced by the surface stored in its galahad
+    * wrapper; colour and rectangle are forwarded unchanged. */
+   inner->clear_render_target(inner,
+                              inner_dst,
+                              rgba,
+                              dstx, dsty,
+                              width, height);
+}
+static void
+galahad_clear_depth_stencil(struct pipe_context *_pipe,
+                            struct pipe_surface *_dst,
+                            unsigned clear_flags,
+                            double depth,
+                            unsigned stencil,
+                            unsigned dstx, unsigned dsty,
+                            unsigned width, unsigned height)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_surface *glhd_dst = galahad_surface(_dst);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_surface *inner_dst = glhd_dst->surface;
+
+   /* The destination is replaced by the surface stored in its galahad
+    * wrapper; flags, clear values and rectangle are forwarded unchanged.
+    */
+   inner->clear_depth_stencil(inner,
+                              inner_dst,
+                              clear_flags,
+                              depth,
+                              stencil,
+                              dstx, dsty,
+                              width, height);
+}
+
+static void
+galahad_flush(struct pipe_context *_pipe,
+              unsigned flags,
+              struct pipe_fence_handle **fence)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct pipe_context *inner = glhd->pipe;
+
+   /* Pass-through: flags and the fence pointer are forwarded to the
+    * inner context exactly as received. */
+   inner->flush(inner, flags, fence);
+}
+
+static unsigned int
+galahad_is_resource_referenced(struct pipe_context *_pipe,
+                               struct pipe_resource *_resource,
+                               unsigned face,
+                               unsigned level)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_resource *glhd_res = galahad_resource(_resource);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_res = glhd_res->resource;
+
+   /* The query is made against the resource stored in the galahad
+    * wrapper; the inner context's answer is returned unchanged. */
+   return inner->is_resource_referenced(inner, inner_res,
+                                        face, level);
+}
+
+static struct pipe_sampler_view *
+galahad_context_create_sampler_view(struct pipe_context *_pipe,
+                                    struct pipe_resource *_resource,
+                                    const struct pipe_sampler_view *templ)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   struct galahad_resource *glhd_res = galahad_resource(_resource);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_res = glhd_res->resource;
+   struct pipe_sampler_view *inner_view;
+
+   /* Create on the inner context; NULL is reported back as NULL. */
+   inner_view = inner->create_sampler_view(inner, inner_res, templ);
+   if (!inner_view)
+      return NULL;
+
+   return galahad_sampler_view_create(glhd, glhd_res,
+                                      inner_view);
+}
+
+static void
+galahad_context_sampler_view_destroy(struct pipe_context *_pipe,
+                                     struct pipe_sampler_view *_view)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   galahad_sampler_view_destroy(glhd, galahad_sampler_view(_view));
+}
+
+static struct pipe_transfer *
+galahad_context_get_transfer(struct pipe_context *_context,
+                             struct pipe_resource *_resource,
+                             struct pipe_subresource sr,
+                             unsigned usage,
+                             const struct pipe_box *box)
+{
+   struct galahad_context *glhd = galahad_context(_context);
+   struct galahad_resource *glhd_res = galahad_resource(_resource);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_res = glhd_res->resource;
+   struct pipe_transfer *inner_xfer;
+
+   /* Ask the inner context for the transfer.  A NULL result is returned
+    * as NULL; otherwise it is handed to galahad_transfer_create().
+    */
+   inner_xfer = inner->get_transfer(inner, inner_res, sr, usage, box);
+   if (!inner_xfer)
+      return NULL;
+
+   return galahad_transfer_create(glhd, glhd_res,
+                                  inner_xfer);
+}
+
+static void
+galahad_context_transfer_destroy(struct pipe_context *_pipe,
+                                 struct pipe_transfer *_transfer)
+{
+   struct galahad_context *glhd = galahad_context(_pipe);
+   galahad_transfer_destroy(glhd, galahad_transfer(_transfer));
+}
+
+static void *
+galahad_context_transfer_map(struct pipe_context *_context,
+                             struct pipe_transfer *_transfer)
+{
+   struct galahad_context *glhd = galahad_context(_context);
+   struct galahad_transfer *glhd_xfer = galahad_transfer(_transfer);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_transfer *inner_xfer = glhd_xfer->transfer;
+
+   /* Map the transfer stored in the wrapper; the pointer is returned as-is. */
+   return inner->transfer_map(inner, inner_xfer);
+}
+
+
+
+static void
+galahad_context_transfer_flush_region(struct pipe_context *_context,
+                                      struct pipe_transfer *_transfer,
+                                      const struct pipe_box *box)
+{
+   struct galahad_context *glhd = galahad_context(_context);
+   struct galahad_transfer *glhd_xfer = galahad_transfer(_transfer);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_transfer *inner_xfer = glhd_xfer->transfer;
+
+   /* The flush is issued on the transfer stored in the wrapper; the
+    * box is forwarded unchanged. */
+   inner->transfer_flush_region(inner, inner_xfer, box);
+}
+
+
+static void
+galahad_context_transfer_unmap(struct pipe_context *_context,
+                               struct pipe_transfer *_transfer)
+{
+   struct galahad_context *glhd = galahad_context(_context);
+   struct galahad_transfer *glhd_xfer = galahad_transfer(_transfer);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_transfer *inner_xfer = glhd_xfer->transfer;
+
+   /* Unmap the transfer stored in the wrapper on the inner context. */
+   inner->transfer_unmap(inner, inner_xfer);
+}
+
+
+static void
+galahad_context_transfer_inline_write(struct pipe_context *_context,
+                                      struct pipe_resource *_resource,
+                                      struct pipe_subresource sr,
+                                      unsigned usage,
+                                      const struct pipe_box *box,
+                                      const void *data,
+                                      unsigned stride,
+                                      unsigned slice_stride)
+{
+   struct galahad_context *glhd = galahad_context(_context);
+   struct galahad_resource *glhd_res = galahad_resource(_resource);
+   struct pipe_context *inner = glhd->pipe;
+   struct pipe_resource *inner_res = glhd_res->resource;
+
+   /* The write targets the resource stored in the galahad wrapper;
+    * all remaining arguments are forwarded unchanged. */
+   inner->transfer_inline_write(inner,
+                                inner_res, sr,
+                                usage,
+                                box,
+                                data,
+                                stride, slice_stride);
+}
+
+
+struct pipe_context *
+galahad_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
+{  /* Builds a galahad_context around 'pipe'; returns its base, or NULL if the allocation fails. */
+   struct galahad_context *glhd_pipe;
+   (void)galahad_screen(_screen); /* result discarded; NOTE(review): presumably only a sanity check of _screen -- confirm */
+
+   glhd_pipe = CALLOC_STRUCT(galahad_context);
+   if (!glhd_pipe) {
+      return NULL; /* 'pipe' is left untouched on this path */
+   }
+
+   glhd_pipe->base.winsys = NULL;
+   glhd_pipe->base.screen = _screen; /* the screen passed in, not pipe->screen */
+   glhd_pipe->base.priv = pipe->priv; /* expose wrapped data */
+   glhd_pipe->base.draw = NULL;
+
+   glhd_pipe->base.destroy = galahad_destroy; /* from here on: install the galahad_* function for every pipe_context hook */
+   glhd_pipe->base.draw_arrays = galahad_draw_arrays;
+   glhd_pipe->base.draw_elements = galahad_draw_elements;
+   glhd_pipe->base.draw_range_elements = galahad_draw_range_elements;
+   glhd_pipe->base.create_query = galahad_create_query;
+   glhd_pipe->base.destroy_query = galahad_destroy_query;
+   glhd_pipe->base.begin_query = galahad_begin_query;
+   glhd_pipe->base.end_query = galahad_end_query;
+   glhd_pipe->base.get_query_result = galahad_get_query_result;
+   glhd_pipe->base.create_blend_state = galahad_create_blend_state;
+   glhd_pipe->base.bind_blend_state = galahad_bind_blend_state;
+   glhd_pipe->base.delete_blend_state = galahad_delete_blend_state;
+   glhd_pipe->base.create_sampler_state = galahad_create_sampler_state;
+   glhd_pipe->base.bind_fragment_sampler_states = galahad_bind_fragment_sampler_states;
+   glhd_pipe->base.bind_vertex_sampler_states = galahad_bind_vertex_sampler_states;
+   glhd_pipe->base.delete_sampler_state = galahad_delete_sampler_state;
+   glhd_pipe->base.create_rasterizer_state = galahad_create_rasterizer_state;
+   glhd_pipe->base.bind_rasterizer_state = galahad_bind_rasterizer_state;
+   glhd_pipe->base.delete_rasterizer_state = galahad_delete_rasterizer_state;
+   glhd_pipe->base.create_depth_stencil_alpha_state = galahad_create_depth_stencil_alpha_state;
+   glhd_pipe->base.bind_depth_stencil_alpha_state = galahad_bind_depth_stencil_alpha_state;
+   glhd_pipe->base.delete_depth_stencil_alpha_state = galahad_delete_depth_stencil_alpha_state;
+   glhd_pipe->base.create_fs_state = galahad_create_fs_state;
+   glhd_pipe->base.bind_fs_state = galahad_bind_fs_state;
+   glhd_pipe->base.delete_fs_state = galahad_delete_fs_state;
+   glhd_pipe->base.create_vs_state = galahad_create_vs_state;
+   glhd_pipe->base.bind_vs_state = galahad_bind_vs_state;
+   glhd_pipe->base.delete_vs_state = galahad_delete_vs_state;
+   glhd_pipe->base.create_vertex_elements_state = galahad_create_vertex_elements_state;
+   glhd_pipe->base.bind_vertex_elements_state = galahad_bind_vertex_elements_state;
+   glhd_pipe->base.delete_vertex_elements_state = galahad_delete_vertex_elements_state;
+   glhd_pipe->base.set_blend_color = galahad_set_blend_color;
+   glhd_pipe->base.set_stencil_ref = galahad_set_stencil_ref;
+   glhd_pipe->base.set_clip_state = galahad_set_clip_state;
+   glhd_pipe->base.set_sample_mask = galahad_set_sample_mask;
+   glhd_pipe->base.set_constant_buffer = galahad_set_constant_buffer;
+   glhd_pipe->base.set_framebuffer_state = galahad_set_framebuffer_state;
+   glhd_pipe->base.set_polygon_stipple = galahad_set_polygon_stipple;
+   glhd_pipe->base.set_scissor_state = galahad_set_scissor_state;
+   glhd_pipe->base.set_viewport_state = galahad_set_viewport_state;
+   glhd_pipe->base.set_fragment_sampler_views = galahad_set_fragment_sampler_views;
+   glhd_pipe->base.set_vertex_sampler_views = galahad_set_vertex_sampler_views;
+   glhd_pipe->base.set_vertex_buffers = galahad_set_vertex_buffers;
+   glhd_pipe->base.resource_copy_region = galahad_resource_copy_region;
+   glhd_pipe->base.clear = galahad_clear;
+   glhd_pipe->base.clear_render_target = galahad_clear_render_target;
+   glhd_pipe->base.clear_depth_stencil = galahad_clear_depth_stencil;
+   glhd_pipe->base.flush = galahad_flush;
+   glhd_pipe->base.is_resource_referenced = galahad_is_resource_referenced;
+   glhd_pipe->base.create_sampler_view = galahad_context_create_sampler_view;
+   glhd_pipe->base.sampler_view_destroy = galahad_context_sampler_view_destroy;
+   glhd_pipe->base.get_transfer = galahad_context_get_transfer;
+   glhd_pipe->base.transfer_destroy = galahad_context_transfer_destroy;
+   glhd_pipe->base.transfer_map = galahad_context_transfer_map;
+   glhd_pipe->base.transfer_unmap = galahad_context_transfer_unmap;
+   glhd_pipe->base.transfer_flush_region = galahad_context_transfer_flush_region;
+   glhd_pipe->base.transfer_inline_write = galahad_context_transfer_inline_write;
+
+   glhd_pipe->pipe = pipe; /* wrapped driver context that the hooks forward to */
+
+   return &glhd_pipe->base; /* callers only ever see the embedded pipe_context */
+}
diff --git a/src/gallium/drivers/galahad/glhd_context.h b/src/gallium/drivers/galahad/glhd_context.h
new file mode 100644
index 0000000..4e71753
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_context.h
@@ -0,0 +1,64 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef GLHD_CONTEXT_H
+#define GLHD_CONTEXT_H
+
+#include <stdio.h>
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+
+
+struct galahad_context {
+   struct pipe_context base;  /**< base class */
+
+   struct pipe_context *pipe; /**< wrapped driver context that calls are forwarded to */
+};
+
+
+struct pipe_context *
+galahad_context_create(struct pipe_screen *screen, struct pipe_context *pipe);
+
+
+static INLINE struct galahad_context *
+galahad_context(struct pipe_context *pipe)
+{
+   return (struct galahad_context *)pipe; /* plain downcast: 'base' is the first member of galahad_context */
+}
+
+#define glhd_warn(...) /* printf-style message to stderr, prefixed with "galahad: <calling function>: " */ \
+do { \
+    fprintf(stderr, "galahad: %s: ", __FUNCTION__); \
+    fprintf(stderr, __VA_ARGS__); \
+    fprintf(stderr, "\n"); /* the newline is appended here */ \
+} while (0)
+
+#define glhd_error(...) /* currently reported exactly like glhd_warn() */ \
+    glhd_warn(__VA_ARGS__) /* no trailing ';' -- the call site supplies it, so "if (x) glhd_error(..); else" parses */
+
+#endif /* GLHD_CONTEXT_H */
diff --git a/src/gallium/drivers/galahad/glhd_drm.c b/src/gallium/drivers/galahad/glhd_drm.c
new file mode 100644
index 0000000..d62f6f4
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_drm.c
@@ -0,0 +1,96 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "state_tracker/drm_api.h"
+
+#include "util/u_memory.h"
+#include "glhd_drm.h"
+#include "glhd_screen.h"
+#include "glhd_public.h"
+
+struct galahad_drm_api
+{
+   struct drm_api base; /* drm_api handed out to callers; first member, so the wrapper can be cast to and from it */
+
+   struct drm_api *api; /* wrapped drm_api that create_screen/destroy forward to */
+};
+
+static INLINE struct galahad_drm_api *
+galahad_drm_api(struct drm_api *_api)
+{
+   return (struct galahad_drm_api *)_api; /* downcast; only valid for a pointer to the 'base' of a galahad_drm_api */
+}
+
+/** drm_api::create_screen hook: create the driver screen, then wrap it. */
+static struct pipe_screen *
+galahad_drm_create_screen(struct drm_api *_api, int fd)
+{
+   struct drm_api *api = galahad_drm_api(_api)->api;
+   struct pipe_screen *screen = api->create_screen(api, fd);
+
+   if (!screen)   /* creation failed: nothing to wrap, report it as-is */
+      return NULL;
+   return galahad_screen_create(screen);
+}
+
+/** Destroy the wrapped drm_api, then free the wrapper itself. */
+static void
+galahad_drm_destroy(struct drm_api *_api)
+{
+   struct galahad_drm_api *wrapper = galahad_drm_api(_api);
+   struct drm_api *inner = wrapper->api;
+   inner->destroy(inner);
+   FREE(wrapper);
+}
+
+/** Wrap a drm_api with galahad when the GALAHAD debug option is true.  On a
+ *  NULL api, a disabled option or an allocation failure the api passed in
+ *  is returned unchanged. */
+struct drm_api *
+galahad_drm_create(struct drm_api *api)
+{
+   struct galahad_drm_api *glhd_api;
+
+   if (!api)
+      goto error;
+   /* Boolean lookup, so that GALAHAD=0 / GALAHAD=no really disables it. */
+   if (!debug_get_bool_option("GALAHAD", FALSE))
+      goto error;
+
+   glhd_api = CALLOC_STRUCT(galahad_drm_api);
+   if (!glhd_api)
+      goto error;
+
+   glhd_api->base.name = api->name;
+   glhd_api->base.driver_name = api->driver_name;
+   glhd_api->base.create_screen = galahad_drm_create_screen;
+   glhd_api->base.destroy = galahad_drm_destroy;
+   glhd_api->api = api;
+   return &glhd_api->base;
+error:
+   return api;
+}
diff --git a/src/gallium/drivers/galahad/glhd_drm.h b/src/gallium/drivers/galahad/glhd_drm.h
new file mode 100644
index 0000000..613ac24
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_drm.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef GLHD_DRM_H
+#define GLHD_DRM_H
+
+struct drm_api;
+
+struct drm_api* galahad_drm_create(struct drm_api *api);
+
+#endif /* GLHD_DRM_H */
diff --git a/src/gallium/drivers/galahad/glhd_objects.c b/src/gallium/drivers/galahad/glhd_objects.c
new file mode 100644
index 0000000..6c5a21a
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_objects.c
@@ -0,0 +1,187 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+#include "glhd_screen.h"
+#include "glhd_objects.h"
+#include "glhd_context.h"
+
+
+
+/** Wrap a driver resource in a galahad_resource that belongs to the galahad
+ *  screen.  On failure the reference on 'resource' is dropped and NULL is
+ *  returned. */
+struct pipe_resource *
+galahad_resource_create(struct galahad_screen *glhd_screen,
+                        struct pipe_resource *resource)
+{
+   struct galahad_resource *wrapper;
+
+   if (!resource)
+      goto fail;
+   assert(resource->screen == glhd_screen->screen);
+
+   wrapper = CALLOC_STRUCT(galahad_resource);
+   if (!wrapper)
+      goto fail;
+
+   wrapper->base = *resource;
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.screen = &glhd_screen->base;
+   wrapper->resource = resource;
+   return &wrapper->base;
+
+fail:
+   pipe_resource_reference(&resource, NULL);
+   return NULL;
+}
+
+void
+galahad_resource_destroy(struct galahad_resource *glhd_resource)
+{
+   pipe_resource_reference(&glhd_resource->resource, NULL); /* drop the wrapper's reference on the wrapped resource */
+   FREE(glhd_resource); /* then release the wrapper itself */
+}
+
+
+/** Wrap a driver surface; the wrapper references its galahad resource.
+ *  On failure the driver surface reference is dropped and NULL returned. */
+struct pipe_surface *
+galahad_surface_create(struct galahad_resource *glhd_resource,
+                        struct pipe_surface *surface)
+{
+   struct galahad_surface *wrapper;
+
+   if (!surface)
+      goto fail;
+   assert(surface->texture == glhd_resource->resource);
+
+   wrapper = CALLOC_STRUCT(galahad_surface);
+   if (!wrapper)
+      goto fail;
+
+   wrapper->base = *surface;
+   wrapper->surface = surface;
+   pipe_reference_init(&wrapper->base.reference, 1);
+   /* Clear the copied driver pointer before taking the wrapper's own reference. */
+   wrapper->base.texture = NULL;
+   pipe_resource_reference(&wrapper->base.texture, &glhd_resource->base);
+   return &wrapper->base;
+
+fail:
+   pipe_surface_reference(&surface, NULL);
+   return NULL;
+}
+
+void
+galahad_surface_destroy(struct galahad_surface *glhd_surface)
+{
+   pipe_resource_reference(&glhd_surface->base.texture, NULL); /* reference taken in galahad_surface_create() */
+   pipe_surface_reference(&glhd_surface->surface, NULL); /* drop the wrapped driver surface */
+   FREE(glhd_surface);
+}
+
+
+/** Wrap a driver sampler view; NULL if 'view' is NULL or allocation fails. */
+struct pipe_sampler_view *
+galahad_sampler_view_create(struct galahad_context *glhd_context,
+                             struct galahad_resource *glhd_resource,
+                             struct pipe_sampler_view *view)
+{
+   struct galahad_sampler_view *glhd_view;
+
+   if (!view)
+      goto error;
+   assert(view->texture == glhd_resource->resource);
+   glhd_view = CALLOC_STRUCT(galahad_sampler_view);
+   if (!glhd_view) /* previously dereferenced unchecked on allocation failure */
+      goto error;
+
+   glhd_view->base = *view;
+   glhd_view->base.reference.count = 1;
+   glhd_view->base.texture = NULL;
+   pipe_resource_reference(&glhd_view->base.texture, glhd_resource->resource);
+   glhd_view->base.context = glhd_context->pipe;
+   glhd_view->sampler_view = view;
+   return &glhd_view->base;
+error:
+   return NULL; /* NOTE(review): 'view' is not released on this path -- confirm who owns it */
+}
+
+void
+galahad_sampler_view_destroy(struct galahad_context *glhd_context,
+                              struct galahad_sampler_view *glhd_view)
+{
+   struct pipe_context *inner = glhd_context->pipe; /* wrapped driver context */
+   pipe_resource_reference(&glhd_view->base.texture, NULL); /* reference taken at creation */
+   inner->sampler_view_destroy(inner, glhd_view->sampler_view); /* driver view is destroyed by the driver */
+   FREE(glhd_view);
+}
+
+
+/** Wrap a driver transfer; the wrapper references its galahad resource.
+ *  On failure the driver transfer (if any) is destroyed and NULL returned. */
+struct pipe_transfer *
+galahad_transfer_create(struct galahad_context *glhd_context,
+                         struct galahad_resource *glhd_resource,
+                         struct pipe_transfer *transfer)
+{
+   struct galahad_transfer *glhd_transfer;
+
+   if(!transfer)
+      goto error;
+   assert(transfer->resource == glhd_resource->resource);
+
+   glhd_transfer = CALLOC_STRUCT(galahad_transfer);
+   if(!glhd_transfer)
+      goto error;
+
+   memcpy(&glhd_transfer->base, transfer, sizeof(struct pipe_transfer));
+   glhd_transfer->base.resource = NULL;
+   glhd_transfer->transfer = transfer;
+   pipe_resource_reference(&glhd_transfer->base.resource, &glhd_resource->base);
+   assert(glhd_transfer->base.resource == &glhd_resource->base);
+   return &glhd_transfer->base;
+
+error:
+   /* Never hand a NULL transfer to the driver's destroy hook. */
+   if (transfer)
+      glhd_context->pipe->transfer_destroy(glhd_context->pipe, transfer);
+   return NULL;
+}
+
+void
+galahad_transfer_destroy(struct galahad_context *glhd_context,
+                          struct galahad_transfer *glhd_transfer)
+{
+   struct pipe_context *inner = glhd_context->pipe; /* wrapped driver context */
+   pipe_resource_reference(&glhd_transfer->base.resource, NULL); /* reference taken at creation */
+   inner->transfer_destroy(inner, glhd_transfer->transfer); /* driver transfer is destroyed by the driver */
+   FREE(glhd_transfer);
+}
diff --git a/src/gallium/drivers/galahad/glhd_objects.h b/src/gallium/drivers/galahad/glhd_objects.h
new file mode 100644
index 0000000..9358039
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_objects.h
@@ -0,0 +1,175 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef GLHD_OBJECTS_H
+#define GLHD_OBJECTS_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "glhd_screen.h"
+
+struct galahad_context;
+
+
+struct galahad_resource
+{
+   struct pipe_resource base;      /* wrapper's own pipe_resource; first member, so the wrapper can be cast to and from it */
+
+   struct pipe_resource *resource; /* wrapped driver resource */
+};
+
+
+struct galahad_sampler_view
+{
+   struct pipe_sampler_view base;          /* wrapper's own view; first member, so the wrapper can be cast to and from it */
+
+   struct pipe_sampler_view *sampler_view; /* wrapped driver sampler view */
+};
+
+
+struct galahad_surface
+{
+   struct pipe_surface base;     /* wrapper's own surface; first member, so the wrapper can be cast to and from it */
+
+   struct pipe_surface *surface; /* wrapped driver surface */
+};
+
+
+struct galahad_transfer
+{
+   struct pipe_transfer base;      /* wrapper's own transfer; first member, so the wrapper can be cast to and from it */
+
+   struct pipe_transfer *transfer; /* wrapped driver transfer */
+};
+
+
+/** Downcast to the galahad wrapper (NULL stays NULL); the owning screen is run through galahad_screen(). */
+static INLINE struct galahad_resource *
+galahad_resource(struct pipe_resource *_resource)
+{
+   if (_resource)
+      (void)galahad_screen(_resource->screen);
+   return (struct galahad_resource *)_resource;
+}
+
+/** Downcast a pipe_sampler_view to its galahad wrapper.  A NULL argument
+ *  yields NULL, because casting a null pointer keeps it null; no other
+ *  check is performed. */
+static INLINE struct galahad_sampler_view *
+galahad_sampler_view(struct pipe_sampler_view *_sampler_view)
+{
+   return (struct galahad_sampler_view *)_sampler_view;
+}
+
+/** Downcast to the galahad wrapper (NULL stays NULL); the texture is run through galahad_resource(). */
+static INLINE struct galahad_surface *
+galahad_surface(struct pipe_surface *_surface)
+{
+   if (_surface)
+      (void)galahad_resource(_surface->texture);
+   return (struct galahad_surface *)_surface;
+}
+
+/** Downcast to the galahad wrapper (NULL stays NULL); the resource is run through galahad_resource(). */
+static INLINE struct galahad_transfer *
+galahad_transfer(struct pipe_transfer *_transfer)
+{
+   if (_transfer)
+      (void)galahad_resource(_transfer->resource);
+   return (struct galahad_transfer *)_transfer;
+}
+
+/** NULL-safe accessor for the driver resource behind a galahad wrapper. */
+static INLINE struct pipe_resource *
+galahad_resource_unwrap(struct pipe_resource *_resource)
+{
+   struct galahad_resource *wrapper = galahad_resource(_resource);
+   return wrapper ? wrapper->resource : NULL;
+}
+
+/** NULL-safe accessor for the driver sampler view behind a galahad wrapper. */
+static INLINE struct pipe_sampler_view *
+galahad_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
+{
+   struct galahad_sampler_view *wrapper = galahad_sampler_view(_sampler_view);
+
+   return wrapper ? wrapper->sampler_view : NULL;
+}
+
+/** NULL-safe accessor for the driver surface behind a galahad wrapper. */
+static INLINE struct pipe_surface *
+galahad_surface_unwrap(struct pipe_surface *_surface)
+{
+   struct galahad_surface *wrapper = galahad_surface(_surface);
+   return wrapper ? wrapper->surface : NULL;
+}
+
+/** NULL-safe accessor for the driver transfer behind a galahad wrapper. */
+static INLINE struct pipe_transfer *
+galahad_transfer_unwrap(struct pipe_transfer *_transfer)
+{
+   struct galahad_transfer *wrapper = galahad_transfer(_transfer);
+   return wrapper ? wrapper->transfer : NULL;
+}
+
+
+struct pipe_resource *
+galahad_resource_create(struct galahad_screen *glhd_screen,
+                         struct pipe_resource *resource);
+
+void
+galahad_resource_destroy(struct galahad_resource *glhd_resource);
+
+struct pipe_surface *
+galahad_surface_create(struct galahad_resource *glhd_resource,
+                        struct pipe_surface *surface);
+
+void
+galahad_surface_destroy(struct galahad_surface *glhd_surface);
+
+struct pipe_sampler_view *
+galahad_sampler_view_create(struct galahad_context *glhd_context,
+                             struct galahad_resource *glhd_resource,
+                             struct pipe_sampler_view *view);
+
+void
+galahad_sampler_view_destroy(struct galahad_context *glhd_context,
+                              struct galahad_sampler_view *glhd_sampler_view);
+
+struct pipe_transfer *
+galahad_transfer_create(struct galahad_context *glhd_context,
+                         struct galahad_resource *glhd_resource,
+                         struct pipe_transfer *transfer);
+
+void
+galahad_transfer_destroy(struct galahad_context *glhd_context,
+                          struct galahad_transfer *glhd_transfer);
+
+
+#endif /* GLHD_OBJECTS_H */
diff --git a/src/gallium/drivers/galahad/glhd_public.h b/src/gallium/drivers/galahad/glhd_public.h
new file mode 100644
index 0000000..77a3801
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_public.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef GLHD_PUBLIC_H
+#define GLHD_PUBLIC_H
+
+struct pipe_screen;
+struct pipe_context;
+
+struct pipe_screen *
+galahad_screen_create(struct pipe_screen *screen);
+
+#endif /* GLHD_PUBLIC_H */
diff --git a/src/gallium/drivers/galahad/glhd_screen.c b/src/gallium/drivers/galahad/glhd_screen.c
new file mode 100644
index 0000000..bcc37cb
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_screen.c
@@ -0,0 +1,330 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * 2010 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+
+#include "glhd_public.h"
+#include "glhd_screen.h"
+#include "glhd_context.h"
+#include "glhd_objects.h"
+
+
+/** Destroy the wrapped screen first, then free the galahad wrapper. */
+static void
+galahad_screen_destroy(struct pipe_screen *_screen)
+{
+   struct galahad_screen *wrapper = galahad_screen(_screen);
+   struct pipe_screen *inner = wrapper->screen;
+
+   inner->destroy(inner);
+   FREE(wrapper);
+}
+
+/** Report the wrapped screen's name unchanged. */
+static const char *
+galahad_screen_get_name(struct pipe_screen *_screen)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+
+   return inner->get_name(inner);
+}
+
+/** Report the wrapped screen's vendor string unchanged. */
+static const char *
+galahad_screen_get_vendor(struct pipe_screen *_screen)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+
+   return inner->get_vendor(inner);
+}
+
+/** Integer capability query: answered by the wrapped screen, with no
+ *  checking added by galahad. */
+static int
+galahad_screen_get_param(struct pipe_screen *_screen,
+                          enum pipe_cap param)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+
+   return inner->get_param(inner, param);
+}
+
+/** Float capability query: answered by the wrapped screen, with no
+ *  checking added by galahad. */
+static float
+galahad_screen_get_paramf(struct pipe_screen *_screen,
+                           enum pipe_cap param)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+
+   return inner->get_paramf(inner, param);
+}
+
+/**
+ * Format-support query.  An out-of-range texture target only triggers a
+ * warning; the query is still forwarded to the wrapped screen's
+ * is_format_supported() and its answer is returned.
+ */
+static boolean
+galahad_screen_is_format_supported(struct pipe_screen *_screen,
+                                    enum pipe_format format,
+                                    enum pipe_texture_target target,
+                                    unsigned sample_count,
+                                    unsigned tex_usage,
+                                    unsigned geom_flags)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+
+   if (target >= PIPE_MAX_TEXTURE_TYPES) {
+      glhd_warn("Received bogus texture target %d", target);
+   }
+   return inner->is_format_supported(inner, format, target,
+                                     sample_count, tex_usage, geom_flags);
+}
+
+/** Create a context on the wrapped screen and wrap it; NULL on failure. */
+static struct pipe_context *
+galahad_screen_context_create(struct pipe_screen *_screen,
+                               void *priv)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+   struct pipe_context *inner_ctx = inner->context_create(inner, priv);
+
+   /* Only a successfully created driver context gets a wrapper. */
+   if (!inner_ctx)
+      return NULL;
+   return galahad_context_create(_screen, inner_ctx);
+}
+
+/** Create a resource on the wrapped screen from the template and return
+ *  a galahad wrapper around it; NULL when the driver returns NULL or
+ *  wrapping fails. */
+static struct pipe_resource *
+galahad_screen_resource_create(struct pipe_screen *_screen,
+                                const struct pipe_resource *templat)
+{
+   struct galahad_screen *wrapper = galahad_screen(_screen);
+   struct pipe_screen *inner = wrapper->screen;
+   struct pipe_resource *inner_res = inner->resource_create(inner, templat);
+
+   if (!inner_res)
+      return NULL;
+   return galahad_resource_create(wrapper, inner_res);
+}
+
+/**
+ * Import a resource from a winsys handle through the wrapped screen and
+ * wrap the result.  A NULL driver result is passed on to
+ * galahad_resource_create(), which turns it into a NULL return.
+ */
+static struct pipe_resource *
+galahad_screen_resource_from_handle(struct pipe_screen *_screen,
+                                     const struct pipe_resource *templ,
+                                     struct winsys_handle *handle)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+   struct pipe_resource *imported;
+
+   /* TODO trace call */
+   imported = inner->resource_from_handle(inner, templ, handle);
+   return galahad_resource_create(galahad_screen(_screen), imported);
+}
+
+/** Handle query for a wrapped resource: unwrap the screen and the
+ *  resource, then forward the call to the wrapped screen and return
+ *  its result. */
+static boolean
+galahad_screen_resource_get_handle(struct pipe_screen *_screen,
+                                    struct pipe_resource *_resource,
+                                    struct winsys_handle *handle)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+   struct pipe_resource *inner_res = galahad_resource(_resource)->resource;
+
+   /* TODO trace call */
+   return inner->resource_get_handle(inner, inner_res, handle);
+}
+
+
+
+static void
+galahad_screen_resource_destroy(struct pipe_screen *screen,
+                                 struct pipe_resource *_resource)
+{  /* 'screen' is unused: the wrapper alone is enough to tear the resource down */
+   galahad_resource_destroy(galahad_resource(_resource)); /* unreferences the wrapped resource and frees the wrapper */
+}
+
+/**
+ * Ask the wrapped screen for a surface on the unwrapped resource and
+ * wrap the surface it returns.  face, level, zslice and usage are passed
+ * through as received.  Returns NULL when the driver returns no surface.
+ */
+static struct pipe_surface *
+galahad_screen_get_tex_surface(struct pipe_screen *_screen,
+                                struct pipe_resource *_resource,
+                                unsigned face,
+                                unsigned level,
+                                unsigned zslice,
+                                unsigned usage)
+{
+   struct pipe_screen *inner = galahad_screen(_screen)->screen;
+   struct galahad_resource *wrapped = galahad_resource(_resource);
+   struct pipe_surface *inner_surf;
+
+   inner_surf = inner->get_tex_surface(inner,
+                                       wrapped->resource,
+                                       face, level, zslice, usage);
+   if (!inner_surf)
+      return NULL;
+
+   return galahad_surface_create(wrapped, inner_surf);
+}
+
+static void
+galahad_screen_tex_surface_destroy(struct pipe_surface *_surface)
+{
+   galahad_surface_destroy(galahad_surface(_surface)); /* drops the resource and driver-surface references, frees the wrapper */
+}
+
+
+
+/**
+ * Create a user buffer on the wrapped screen and return a galahad
+ * wrapper for it; NULL when the driver returns NULL.
+ */
+static struct pipe_resource *
+galahad_screen_user_buffer_create(struct pipe_screen *_screen,
+                                   void *ptr,
+                                   unsigned bytes,
+                                   unsigned usage)
+{
+   struct galahad_screen *wrapper = galahad_screen(_screen);
+   struct pipe_screen *inner = wrapper->screen;
+   struct pipe_resource *inner_buf;
+
+   inner_buf = inner->user_buffer_create(inner, ptr, bytes, usage);
+   if (!inner_buf)
+      return NULL;
+   return galahad_resource_create(wrapper, inner_buf);
+}
+
+
+
+static void
+galahad_screen_flush_frontbuffer(struct pipe_screen *_screen,
+                                  struct pipe_surface *_surface,
+                                  void *context_private)
+{
+   struct galahad_screen *glhd_screen = galahad_screen(_screen);
+   struct galahad_surface *glhd_surface = galahad_surface(_surface);
+   struct pipe_screen *screen = glhd_screen->screen;
+   struct pipe_surface *surface = glhd_surface->surface;
+
+   screen->flush_frontbuffer(screen,
+                             surface,
+                             context_private);
+}
+
+static void
+galahad_screen_fence_reference(struct pipe_screen *_screen,
+                                struct pipe_fence_handle **ptr,
+                                struct pipe_fence_handle *fence)
+{
+   struct galahad_screen *glhd_screen = galahad_screen(_screen);
+   struct pipe_screen *screen = glhd_screen->screen;
+
+   screen->fence_reference(screen,
+                           ptr,
+                           fence);
+}
+
+static int
+galahad_screen_fence_signalled(struct pipe_screen *_screen,
+                                struct pipe_fence_handle *fence,
+                                unsigned flags)
+{
+   struct galahad_screen *glhd_screen = galahad_screen(_screen);
+   struct pipe_screen *screen = glhd_screen->screen;
+
+   return screen->fence_signalled(screen,
+                                  fence,
+                                  flags);
+}
+
+static int
+galahad_screen_fence_finish(struct pipe_screen *_screen,
+                             struct pipe_fence_handle *fence,
+                             unsigned flags)
+{
+   struct galahad_screen *glhd_screen = galahad_screen(_screen);
+   struct pipe_screen *screen = glhd_screen->screen;
+
+   return screen->fence_finish(screen,
+                               fence,
+                               flags);
+}
+
+struct pipe_screen *
+galahad_screen_create(struct pipe_screen *screen)
+{
+   struct galahad_screen *glhd_screen;
+
+   glhd_screen = CALLOC_STRUCT(galahad_screen);
+   if (!glhd_screen) {
+      return NULL;
+   }
+
+   glhd_screen->base.winsys = NULL;
+
+   glhd_screen->base.destroy = galahad_screen_destroy;
+   glhd_screen->base.get_name = galahad_screen_get_name;
+   glhd_screen->base.get_vendor = galahad_screen_get_vendor;
+   glhd_screen->base.get_param = galahad_screen_get_param;
+   glhd_screen->base.get_paramf = galahad_screen_get_paramf;
+   glhd_screen->base.is_format_supported = galahad_screen_is_format_supported;
+   glhd_screen->base.context_create = galahad_screen_context_create;
+   glhd_screen->base.resource_create = galahad_screen_resource_create;
+   glhd_screen->base.resource_from_handle = galahad_screen_resource_from_handle;
+   glhd_screen->base.resource_get_handle = galahad_screen_resource_get_handle;
+   glhd_screen->base.resource_destroy = galahad_screen_resource_destroy;
+   glhd_screen->base.get_tex_surface = galahad_screen_get_tex_surface;
+   glhd_screen->base.tex_surface_destroy = galahad_screen_tex_surface_destroy;
+   glhd_screen->base.user_buffer_create = galahad_screen_user_buffer_create;
+   glhd_screen->base.flush_frontbuffer = galahad_screen_flush_frontbuffer;
+   glhd_screen->base.fence_reference = galahad_screen_fence_reference;
+   glhd_screen->base.fence_signalled = galahad_screen_fence_signalled;
+   glhd_screen->base.fence_finish = galahad_screen_fence_finish;
+
+   glhd_screen->screen = screen;
+
+   return &glhd_screen->base;
+}
diff --git a/src/gallium/drivers/galahad/glhd_screen.h b/src/gallium/drivers/galahad/glhd_screen.h
new file mode 100644
index 0000000..7862f4a
--- /dev/null
+++ b/src/gallium/drivers/galahad/glhd_screen.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef GLHD_SCREEN_H
+#define GLHD_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+
+
+struct galahad_screen {
+   struct pipe_screen base;
+
+   struct pipe_screen *screen;
+};
+
+
+static INLINE struct galahad_screen *
+galahad_screen(struct pipe_screen *screen)
+{
+   return (struct galahad_screen *)screen;
+}
+
+#endif /* GLHD_SCREEN_H */
diff --git a/src/gallium/drivers/i915/SConscript b/src/gallium/drivers/i915/SConscript
index 7b69681..d6e7a8d 100644
--- a/src/gallium/drivers/i915/SConscript
+++ b/src/gallium/drivers/i915/SConscript
@@ -2,6 +2,10 @@
 
 env = env.Clone()
 
+if msvc:
+	print 'warning: not building i915g'
+	Return()
+
 i915 = env.ConvenienceLibrary(
 	target = 'i915',
 	source = [
diff --git a/src/gallium/drivers/i915/i915_blit.c b/src/gallium/drivers/i915/i915_blit.c
index c5b5979..0a1b3e0 100644
--- a/src/gallium/drivers/i915/i915_blit.c
+++ b/src/gallium/drivers/i915/i915_blit.c
@@ -31,7 +31,6 @@
 #include "i915_batch.h"
 #include "i915_debug.h"
 
-#define FILE_DEBUG_FLAG DEBUG_BLIT
 
 void
 i915_fill_blit(struct i915_context *i915,
@@ -47,10 +46,8 @@
    unsigned BR13, CMD;
 
 
-   I915_DBG(i915,
-      "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-      __FUNCTION__,
-      dst_buffer, dst_pitch, dst_offset, x, y, w, h);
+   I915_DBG(DBG_BLIT, "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+            __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h);
 
    switch (cpp) {
    case 1:
@@ -100,11 +97,11 @@
    int dst_x2 = dst_x + w;
 
 
-   I915_DBG(i915,
-      "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-      __FUNCTION__,
-      src_buffer, src_pitch, src_offset, src_x, src_y,
-      dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
+   I915_DBG(DBG_BLIT,
+            "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+            __FUNCTION__,
+            src_buffer, src_pitch, src_offset, src_x, src_y,
+            dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
 
    switch (cpp) {
    case 1:
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index acc0ffe..ac02ab2 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -237,8 +237,6 @@
 
    struct i915_state current;
    unsigned hardware_dirty;
-   
-   unsigned debug;
 };
 
 /* A flag for each state_tracker state object:
diff --git a/src/gallium/drivers/i915/i915_debug.c b/src/gallium/drivers/i915/i915_debug.c
index 663fac3..57d3390 100644
--- a/src/gallium/drivers/i915/i915_debug.c
+++ b/src/gallium/drivers/i915/i915_debug.c
@@ -27,11 +27,37 @@
 
 #include "i915_reg.h"
 #include "i915_context.h"
+#include "i915_screen.h"
 #include "i915_debug.h"
+#include "i915_debug_private.h"
 #include "i915_batch.h"
 #include "util/u_debug.h"
 
 
+
+static const struct debug_named_value debug_options[] = {
+   {"blit",      DBG_BLIT,      "Print when using the 2d blitter"},
+   {"emit",      DBG_EMIT,      "State emit information"},
+   {"atoms",     DBG_ATOMS,     "Print dirty state atoms"},
+   {"flush",     DBG_FLUSH,     "Flushing information"},
+   {"texture",   DBG_TEXTURE,   "Texture information"},
+   {"constants", DBG_CONSTANTS, "Constant buffers"},
+   DEBUG_NAMED_VALUE_END
+};
+
+unsigned i915_debug = 0;
+
+void i915_debug_init(struct i915_screen *screen)
+{
+   i915_debug = debug_get_flags_option("I915_DEBUG", debug_options, 0);
+}
+
+
+
+/***********************************************************************
+ * Batchbuffer dumping
+ */
+
 static void
 PRINTF(
    struct debug_stream  *stream,
@@ -896,3 +922,66 @@
 }
 
 
+
+/***********************************************************************
+ * Dirty state atom dumping
+ */
+
+void
+i915_dump_dirty(struct i915_context *i915, const char *func)
+{
+   struct {
+      unsigned dirty;
+      const char *name;
+   } l[] = {
+      {I915_NEW_VIEWPORT,      "viewport"},
+      {I915_NEW_RASTERIZER,    "rasterizer"},
+      {I915_NEW_FS,            "fs"},
+      {I915_NEW_BLEND,         "blend"},
+      {I915_NEW_CLIP,          "clip"},
+      {I915_NEW_SCISSOR,       "scissor"},
+      {I915_NEW_STIPPLE,       "stipple"},
+      {I915_NEW_FRAMEBUFFER,   "framebuffer"},
+      {I915_NEW_ALPHA_TEST,    "alpha_test"},
+      {I915_NEW_DEPTH_STENCIL, "depth_stencil"},
+      {I915_NEW_SAMPLER,       "sampler"},
+      {I915_NEW_SAMPLER_VIEW,  "sampler_view"},
+      {I915_NEW_CONSTANTS,     "constants"},
+      {I915_NEW_VBO,           "vbo"},
+      {I915_NEW_VS,            "vs"},
+      {0, NULL},
+   };
+   int i;
+
+   debug_printf("%s: ", func);
+   for (i = 0; l[i].name; i++)
+      if (i915->dirty & l[i].dirty)
+         debug_printf("%s ", l[i].name);
+   debug_printf("\n");
+}
+
+void
+i915_dump_hardware_dirty(struct i915_context *i915, const char *func)
+{
+   struct {
+      unsigned dirty;
+      const char *name;
+   } l[] = {
+      {I915_HW_STATIC,    "static"},
+      {I915_HW_DYNAMIC,   "dynamic"},
+      {I915_HW_SAMPLER,   "sampler"},
+      {I915_HW_MAP,       "map"},
+      {I915_HW_PROGRAM,   "program"},
+      {I915_HW_CONSTANTS, "constants"},
+      {I915_HW_IMMEDIATE, "immediate"},
+      {I915_HW_INVARIENT, "invarient"},
+      {0, NULL},
+   };
+   int i;
+
+   debug_printf("%s: ", func);
+   for (i = 0; l[i].name; i++)
+      if (i915->hardware_dirty & l[i].dirty)
+         debug_printf("%s ", l[i].name);
+   debug_printf("\n");
+}
diff --git a/src/gallium/drivers/i915/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h
index 67b8d9c..8aa09f9 100644
--- a/src/gallium/drivers/i915/i915_debug.h
+++ b/src/gallium/drivers/i915/i915_debug.h
@@ -26,89 +26,51 @@
  **************************************************************************/
 
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ *           Jakob Bornecrantz <wallbraker@gmail.com>
  */
 
 #ifndef I915_DEBUG_H
 #define I915_DEBUG_H
 
-#include <stdarg.h>
+#include "util/u_debug.h"
 
+struct i915_screen;
 struct i915_context;
+struct i915_winsys_batchbuffer;
 
-struct debug_stream 
+#define DBG_BLIT      0x1
+#define DBG_EMIT      0x2
+#define DBG_ATOMS     0x4
+#define DBG_FLUSH     0x8
+#define DBG_TEXTURE   0x10
+#define DBG_CONSTANTS 0x20
+
+extern unsigned i915_debug;
+
+static INLINE boolean
+I915_DBG_ON(unsigned flags)
 {
-   unsigned offset;		/* current gtt offset */
-   char *ptr;		/* pointer to gtt offset zero */
-   char *end;		/* pointer to gtt offset zero */
-   unsigned print_addresses;
-};
-
-
-/* Internal functions
- */
-void i915_disassemble_program(struct debug_stream *stream, 
-			      const unsigned *program, unsigned sz);
-
-void i915_print_ureg(const char *msg, unsigned ureg);
-
-
-#define DEBUG_BATCH	 0x1
-#define DEBUG_BLIT       0x2
-#define DEBUG_BUFFER     0x4
-#define DEBUG_CONSTANTS  0x8
-#define DEBUG_CONTEXT    0x10
-#define DEBUG_DRAW	 0x20
-#define DEBUG_DYNAMIC	 0x40
-#define DEBUG_FLUSH      0x80
-#define DEBUG_MAP	 0x100
-#define DEBUG_PROGRAM	 0x200
-#define DEBUG_REGIONS    0x400
-#define DEBUG_SAMPLER	 0x800
-#define DEBUG_STATIC	 0x1000
-#define DEBUG_SURFACE    0x2000
-#define DEBUG_WINSYS     0x4000
-
-#include "pipe/p_compiler.h"
-
-#if defined(DEBUG) && defined(FILE_DEBUG_FLAG)
-
-#include "util/u_simple_screen.h"
+   return i915_debug & flags;
+}
 
 static INLINE void
-I915_DBG(
-   struct i915_context  *i915,
-   const char           *fmt,
-                        ... )
+I915_DBG(unsigned flags, const char *fmt, ...)
 {
-   if ((i915)->debug & FILE_DEBUG_FLAG) {
+   if (I915_DBG_ON(flags)) {
       va_list  args;
 
-      va_start( args, fmt );
-      debug_vprintf( fmt, args );
-      va_end( args );
+      va_start(args, fmt);
+      debug_vprintf(fmt, args);
+      va_end(args);
    }
 }
 
-#else
+void i915_debug_init(struct i915_screen *i915);
 
-static INLINE void
-I915_DBG(
-   struct i915_context  *i915,
-   const char           *fmt,
-                        ... )
-{
-   (void) i915;
-   (void) fmt;
-}
+void i915_dump_batchbuffer(struct i915_winsys_batchbuffer *i915);
 
-#endif
+void i915_dump_dirty(struct i915_context *i915, const char *func);
 
-
-struct i915_winsys_batchbuffer;
-
-void i915_dump_batchbuffer( struct i915_winsys_batchbuffer *i915 );
-
-void i915_debug_init( struct i915_context *i915 );
-
+void i915_dump_hardware_dirty(struct i915_context *i915, const char *func);
 
 #endif
diff --git a/src/gallium/drivers/i915/i915_debug_fp.c b/src/gallium/drivers/i915/i915_debug_fp.c
index f41c51f..50f49c5 100644
--- a/src/gallium/drivers/i915/i915_debug_fp.c
+++ b/src/gallium/drivers/i915/i915_debug_fp.c
@@ -28,6 +28,7 @@
 
 #include "i915_reg.h"
 #include "i915_debug.h"
+#include "i915_debug_private.h"
 #include "util/u_debug.h"
 
 
diff --git a/src/gallium/drivers/i915/i915_debug_private.h b/src/gallium/drivers/i915/i915_debug_private.h
new file mode 100644
index 0000000..b3668d0
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_debug_private.h
@@ -0,0 +1,45 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef I915_DEBUG_PRIVATE_H
+#define I915_DEBUG_PRIVATE_H
+
+struct debug_stream 
+{
+   unsigned offset;		/* current gtt offset */
+   char *ptr;		/* pointer to gtt offset zero */
+   char *end;		/* pointer to gtt offset zero */
+   unsigned print_addresses;
+};
+
+void i915_disassemble_program(struct debug_stream *stream, 
+			      const unsigned *program, unsigned sz);
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_flush.c b/src/gallium/drivers/i915/i915_flush.c
index 1582168..9671464 100644
--- a/src/gallium/drivers/i915/i915_flush.c
+++ b/src/gallium/drivers/i915/i915_flush.c
@@ -35,6 +35,7 @@
 #include "i915_context.h"
 #include "i915_reg.h"
 #include "i915_batch.h"
+#include "i915_debug.h"
 
 
 static void i915_flush( struct pipe_context *pipe,
@@ -76,10 +77,10 @@
     */
    FLUSH_BATCH(fence);
    i915->vbo_flushed = 1;
+
+   I915_DBG(DBG_FLUSH, "%s: #####\n", __FUNCTION__);
 }
 
-
-
 void i915_init_flush_functions( struct i915_context *i915 )
 {
    i915->base.flush = i915_flush;
diff --git a/src/gallium/drivers/i915/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c
index f8665ac..bd046bd 100644
--- a/src/gallium/drivers/i915/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915/i915_prim_vbuf.c
@@ -52,8 +52,7 @@
 #include "i915_state.h"
 
 
-#undef VBUF_USE_FIFO
-#undef VBUF_MAP_BUFFER
+#define VBUF_MAP_BUFFER
 
 /**
  * Primitive renderer for i915.
@@ -79,23 +78,18 @@
    struct i915_winsys_buffer *vbo;
    size_t vbo_size; /**< current size of allocated buffer */
    size_t vbo_alloc_size; /**< minimum buffer size to allocate */
-   size_t vbo_offset;
+   size_t vbo_hw_offset; /**< offset that we program the hardware with */
+   size_t vbo_sw_offset; /**< offset that we work with */
+   size_t vbo_index; /**< index offset to be added to all indices */
    void *vbo_ptr;
    size_t vbo_max_used;
+   size_t vbo_max_index; /**< max_index of the most recently unmapped vertices */
 
 #ifndef VBUF_MAP_BUFFER
    size_t map_used_start;
    size_t map_used_end;
    size_t map_size;
 #endif
-
-#ifdef VBUF_USE_FIFO
-   /* Stuff for the pool */
-   struct util_fifo *pool_fifo;
-   unsigned pool_used;
-   unsigned pool_buffer_size;
-   boolean pool_not_used;
-#endif
 };
 
 
@@ -109,6 +103,35 @@
    return (struct i915_vbuf_render *)render;
 }
 
+/**
+ * If the vbo state differs between renderer and context, push the
+ * renderer's state to the context: hw_offset goes to i915->vbo_offset
+ * and vbo to i915->vbo.
+ *
+ * Side effects:
+ *    May update the context's vbo and vbo_offset and set I915_NEW_VBO.
+ */
+static void
+i915_vbuf_update_vbo_state(struct vbuf_render *render)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+
+   if (i915->vbo != i915_render->vbo ||
+       i915->vbo_offset != i915_render->vbo_hw_offset) {
+      i915->vbo = i915_render->vbo;
+      i915->vbo_offset = i915_render->vbo_hw_offset;
+      i915->dirty |= I915_NEW_VBO;
+   }
+}
+
+/**
+ * Callback exported to the draw module.
+ * Returns the current vertex_info.
+ *
+ * Side effects:
+ *    If state is dirty update derived state.
+ */
 static const struct vertex_info *
 i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
 {
@@ -123,12 +146,18 @@
    return &i915->current.vertex_info;
 }
 
+/**
+ * Reserve space in the vbo for vertices.
+ *
+ * Side effects:
+ *    None.
+ */
 static boolean
 i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
 {
    struct i915_context *i915 = i915_render->i915;
 
-   if (i915_render->vbo_size < size + i915_render->vbo_offset)
+   if (i915_render->vbo_size < size + i915_render->vbo_sw_offset)
       return FALSE;
 
    if (i915->vbo_flushed)
@@ -137,28 +166,28 @@
    return TRUE;
 }
 
+/**
+ * Allocate a new vbo buffer should there not be enough space for
+ * the requested number of vertices by the draw module.
+ *
+ * Side effects:
+ *    Updates hw_offset, sw_offset, index and allocates a new buffer.
+ */
 static void
 i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
 {
    struct i915_context *i915 = i915_render->i915;
    struct i915_winsys *iws = i915->iws;
 
-   if (i915_render->vbo) {
-#ifdef VBUF_USE_FIFO
-      if (i915_render->pool_not_used)
-         iws->buffer_destroy(iws, i915_render->vbo);
-      else
-         u_fifo_add(i915_render->pool_fifo, i915_render->vbo);
-      i915_render->vbo = NULL;
-#else
+   if (i915_render->vbo)
       iws->buffer_destroy(iws, i915_render->vbo);
-#endif
-   }
 
    i915->vbo_flushed = 0;
 
    i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
-   i915_render->vbo_offset = 0;
+   i915_render->vbo_hw_offset = 0;
+   i915_render->vbo_sw_offset = 0;
+   i915_render->vbo_index = 0;
 
 #ifndef VBUF_MAP_BUFFER
    if (i915_render->vbo_size > i915_render->map_size) {
@@ -168,52 +197,51 @@
    }
 #endif
 
-#ifdef VBUF_USE_FIFO
-   if (i915_render->vbo_size != i915_render->pool_buffer_size) {
-      i915_render->pool_not_used = TRUE;
-      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-            I915_NEW_VERTEX);
-   } else {
-      i915_render->pool_not_used = FALSE;
-
-      if (i915_render->pool_used >= 2) {
-         FLUSH_BATCH(NULL);
-         i915->vbo_flushed = 0;
-         i915_render->pool_used = 0;
-      }
-      u_fifo_pop(i915_render->pool_fifo, (void**)&i915_render->vbo);
-   }
-#else
    i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size,
                                          64, I915_NEW_VERTEX);
-#endif
 }
 
+/**
+ * Callback exported to the draw module.
+ *
+ * Side effects:
+ *    Updates hw_offset, sw_offset, index and may allocate
+ *    a new buffer. May also update the vbo state
+ *    on the i915 context.
+ */
 static boolean
 i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
                                    ushort vertex_size,
                                    ushort nr_vertices)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
-   struct i915_context *i915 = i915_render->i915;
    size_t size = (size_t)vertex_size * (size_t)nr_vertices;
+   size_t offset;
 
-   /* FIXME: handle failure */
-   assert(!i915->vbo);
-
-   if (!i915_vbuf_render_reserve(i915_render, size)) {
-#ifdef VBUF_USE_FIFO
-      /* incase we flushed reset the number of pool buffers used */
-      if (i915->vbo_flushed)
-         i915_render->pool_used = 0;
-#endif
-      i915_vbuf_render_new_buf(i915_render, size);
+   /*
+    * Align sw_offset with first multiple of vertex size from hw_offset.
+    * Set index to be the multiples from hw_offset to sw_offset.
+    * i915_vbuf_render_new_buf will reset index, sw_offset and hw_offset
+    * when it allocates a new buffer, so this stays correct.
+    */
+   {
+      offset = i915_render->vbo_sw_offset - i915_render->vbo_hw_offset;
+      offset = util_align_npot(offset, vertex_size);
+      i915_render->vbo_sw_offset = i915_render->vbo_hw_offset + offset;
+      i915_render->vbo_index = offset / vertex_size;
    }
 
+   if (!i915_vbuf_render_reserve(i915_render, size))
+      i915_vbuf_render_new_buf(i915_render, size);
+
+   /*
+    * If a new buffer has been allocated, sw_offset,
+    * hw_offset & index will be reset by new_buf
+    */
+
    i915_render->vertex_size = vertex_size;
-   i915->vbo = i915_render->vbo;
-   i915->vbo_offset = i915_render->vbo_offset;
-   i915->dirty |= I915_NEW_VBO;
+
+   i915_vbuf_update_vbo_state(render);
 
    if (!i915_render->vbo)
       return FALSE;
@@ -232,7 +260,7 @@
 
 #ifdef VBUF_MAP_BUFFER
    i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
-   return (unsigned char *)i915_render->vbo_ptr + i915_render->vbo_offset;
+   return (unsigned char *)i915_render->vbo_ptr + i915_render->vbo_sw_offset;
 #else
    (void)iws;
    return (unsigned char *)i915_render->vbo_ptr;
@@ -248,6 +276,7 @@
    struct i915_context *i915 = i915_render->i915;
    struct i915_winsys *iws = i915->iws;
 
+   i915_render->vbo_max_index = max_index;
    i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used, i915_render->vertex_size * (max_index + 1));
 #ifdef VBUF_MAP_BUFFER
    iws->buffer_unmap(iws, i915_render->vbo);
@@ -255,13 +284,36 @@
    i915_render->map_used_start = i915_render->vertex_size * min_index;
    i915_render->map_used_end = i915_render->vertex_size * (max_index + 1);
    iws->buffer_write(iws, i915_render->vbo,
-                     i915_render->map_used_start + i915_render->vbo_offset,
+                     i915_render->map_used_start + i915_render->vbo_sw_offset,
                      i915_render->map_used_end - i915_render->map_used_start,
                      (unsigned char *)i915_render->vbo_ptr + i915_render->map_used_start);
 
 #endif
 }
 
+/**
+ * Ensure that max_index plus the index offset fits in an unsigned short,
+ * since indices are emitted packed as 16 bit values. If it does not fit,
+ * advance hw_offset to sw_offset's position in the vbo and zero index.
+ *
+ * Side effects:
+ *    On overflow updates hw_offset, index and the context vbo state.
+ */
+static void
+i915_vbuf_ensure_index_bounds(struct vbuf_render *render,
+                              unsigned max_index)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+
+   if (max_index + i915_render->vbo_index < ((1 << 16) - 1))
+      return;
+
+   i915_render->vbo_hw_offset = i915_render->vbo_sw_offset;
+   i915_render->vbo_index = 0;
+
+   i915_vbuf_update_vbo_state(render);
+}
+
 static boolean
 i915_vbuf_render_set_primitive(struct vbuf_render *render, 
                                unsigned prim)
@@ -327,7 +379,9 @@
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
    unsigned i;
-   unsigned end = start + nr;
+   unsigned end = start + nr + i915_render->vbo_index;
+   start += i915_render->vbo_index;
+
    switch(type) {
    case 0:
       for (i = start; i+1 < end; i += 2)
@@ -391,16 +445,18 @@
    struct i915_context *i915 = i915_render->i915;
    unsigned nr_indices;
 
+   nr_indices = draw_arrays_calc_nr_indices(nr, i915_render->fallback);
+   if (!nr_indices)
+      return;
+
+   i915_vbuf_ensure_index_bounds(render, start + nr_indices);
+
    if (i915->dirty)
       i915_update_derived(i915);
 
    if (i915->hardware_dirty)
       i915_emit_hardware_state(i915);
 
-   nr_indices = draw_arrays_calc_nr_indices(nr, i915_render->fallback);
-   if (!nr_indices)
-      return;
-
    if (!BEGIN_BATCH(1 + (nr_indices + 1)/2, 1)) {
       FLUSH_BATCH(NULL);
 
@@ -415,6 +471,7 @@
          goto out;
       }
    }
+
    OUT_BATCH(_3DPRIMITIVE |
              PRIM_INDIRECT |
              i915_render->hwprim |
@@ -440,6 +497,9 @@
       return;
    }
 
+   i915_vbuf_ensure_index_bounds(render, start + nr);
+   start += i915_render->vbo_index;
+
    if (i915->dirty)
       i915_update_derived(i915);
 
@@ -485,35 +545,36 @@
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
    unsigned i;
+   unsigned o = i915_render->vbo_index;
 
    switch(type) {
    case 0:
       for (i = 0; i + 1 < nr_indices; i += 2) {
-         OUT_BATCH(indices[i] | indices[i+1] << 16);
+         OUT_BATCH((o+indices[i]) | (o+indices[i+1]) << 16);
       }
       if (i < nr_indices) {
-         OUT_BATCH(indices[i]);
+         OUT_BATCH((o+indices[i]));
       }
       break;
    case PIPE_PRIM_LINE_LOOP:
       if (nr_indices >= 2) {
          for (i = 1; i < nr_indices; i++)
-            OUT_BATCH(indices[i-1] | indices[i] << 16);
-         OUT_BATCH(indices[i-1] | indices[0] << 16);
+            OUT_BATCH((o+indices[i-1]) | (o+indices[i]) << 16);
+         OUT_BATCH((o+indices[i-1]) | (o+indices[0]) << 16);
       }
       break;
    case PIPE_PRIM_QUADS:
       for (i = 0; i + 3 < nr_indices; i += 4) {
-         OUT_BATCH(indices[i+0] | indices[i+1] << 16);
-         OUT_BATCH(indices[i+3] | indices[i+1] << 16);
-         OUT_BATCH(indices[i+2] | indices[i+3] << 16);
+         OUT_BATCH((o+indices[i+0]) | (o+indices[i+1]) << 16);
+         OUT_BATCH((o+indices[i+3]) | (o+indices[i+1]) << 16);
+         OUT_BATCH((o+indices[i+2]) | (o+indices[i+3]) << 16);
       }
       break;
    case PIPE_PRIM_QUAD_STRIP:
       for (i = 0; i + 3 < nr_indices; i += 2) {
-         OUT_BATCH(indices[i+0] | indices[i+1] << 16);
-         OUT_BATCH(indices[i+3] | indices[i+2] << 16);
-         OUT_BATCH(indices[i+0] | indices[i+3] << 16);
+         OUT_BATCH((o+indices[i+0]) | (o+indices[i+1]) << 16);
+         OUT_BATCH((o+indices[i+3]) | (o+indices[i+2]) << 16);
+         OUT_BATCH((o+indices[i+0]) | (o+indices[i+3]) << 16);
       }
       break;
    default:
@@ -558,6 +619,8 @@
    if (!nr_indices)
       return;
 
+   i915_vbuf_ensure_index_bounds(render, i915_render->vbo_max_index);
+
    if (i915->dirty)
       i915_update_derived(i915);
 
@@ -597,14 +660,15 @@
 i915_vbuf_render_release_vertices(struct vbuf_render *render)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
-   struct i915_context *i915 = i915_render->i915;
 
-   assert(i915->vbo);
-
-   i915_render->vbo_offset += i915_render->vbo_max_used;
+   i915_render->vbo_sw_offset += i915_render->vbo_max_used;
    i915_render->vbo_max_used = 0;
-   i915->vbo = NULL;
-   i915->dirty |= I915_NEW_VBO;
+
+   /*
+    * Micro optimization: by calling update here, the offset change
+    * will be picked up on the next pipe_context::draw_*.
+    */
+   i915_vbuf_update_vbo_state(render);
 }
 
 static void
@@ -652,7 +716,8 @@
    i915_render->vbo = NULL;
    i915_render->vbo_ptr = NULL;
    i915_render->vbo_size = 0;
-   i915_render->vbo_offset = 0;
+   i915_render->vbo_hw_offset = 0;
+   i915_render->vbo_sw_offset = 0;
    i915_render->vbo_alloc_size = i915_render->base.max_vertex_buffer_bytes * 4;
 
 #ifdef VBUF_USE_POOL
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index f824265..255538e 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -31,6 +31,7 @@
 #include "util/u_string.h"
 
 #include "i915_reg.h"
+#include "i915_debug.h"
 #include "i915_context.h"
 #include "i915_screen.h"
 #include "i915_surface.h"
@@ -330,5 +331,7 @@
    i915_init_screen_resource_functions(is);
    i915_init_screen_surface_functions(is);
 
+   i915_debug_init(is);
+
    return &is->base;
 }
diff --git a/src/gallium/drivers/i915/i915_state.h b/src/gallium/drivers/i915/i915_state.h
index 86c6b00..7795046 100644
--- a/src/gallium/drivers/i915/i915_state.h
+++ b/src/gallium/drivers/i915/i915_state.h
@@ -35,16 +35,21 @@
 
 
 struct i915_tracked_state {
+   const char *name;
+   void (*update)(struct i915_context *);
    unsigned dirty;
-   void (*update)( struct i915_context * );
 };
 
-void i915_update_immediate( struct i915_context *i915 );
-void i915_update_dynamic( struct i915_context *i915 );
-void i915_update_derived( struct i915_context *i915 );
-void i915_update_samplers( struct i915_context *i915 );
-void i915_update_textures(struct i915_context *i915);
+extern struct i915_tracked_state i915_update_vertex_layout;
 
-void i915_emit_hardware_state( struct i915_context *i915 );
+extern struct i915_tracked_state i915_hw_samplers;
+extern struct i915_tracked_state i915_hw_sampler_views;
+extern struct i915_tracked_state i915_hw_immediate;
+extern struct i915_tracked_state i915_hw_dynamic;
+extern struct i915_tracked_state i915_hw_fs;
+extern struct i915_tracked_state i915_hw_framebuffer;
+
+void i915_update_derived(struct i915_context *i915);
+void i915_emit_hardware_state(struct i915_context *i915);
 
 #endif
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 4da4677..c059540 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -32,15 +32,16 @@
 #include "draw/draw_vertex.h"
 #include "i915_context.h"
 #include "i915_state.h"
+#include "i915_debug.h"
 #include "i915_reg.h"
 
 
 
-/**
+/***********************************************************************
  * Determine the hardware vertex layout.
  * Depends on vertex/fragment shader state.
  */
-static void calculate_vertex_layout( struct i915_context *i915 )
+static void calculate_vertex_layout(struct i915_context *i915)
 {
    const struct i915_fragment_shader *fs = i915->fs;
    const enum interp_mode colorInterp = i915->rasterizer->color_interp;
@@ -146,37 +147,71 @@
    }
 }
 
+struct i915_tracked_state i915_update_vertex_layout = {
+   "vertex_layout",
+   calculate_vertex_layout,
+   I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
+};
 
 
 
-/* Hopefully this will remain quite simple, otherwise need to pull in
- * something like the state tracker mechanism.
+/***********************************************************************
+ * Update fragment state
  */
-void i915_update_derived( struct i915_context *i915 )
+static void update_fs(struct i915_context *i915)
 {
-   if (i915->dirty & (I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS))
-      calculate_vertex_layout( i915 );
+   i915->hardware_dirty |= I915_HW_PROGRAM; /* XXX right? */
+}
 
-   if (i915->dirty & (I915_NEW_SAMPLER | I915_NEW_SAMPLER_VIEW))
-      i915_update_samplers(i915);
+struct i915_tracked_state i915_hw_fs = {
+   "fs",
+   update_fs,
+   I915_NEW_FS
+};
 
-   if (i915->dirty & I915_NEW_SAMPLER_VIEW)
-      i915_update_textures(i915);
 
-   if (i915->dirty)
-      i915_update_immediate( i915 );
 
-   if (i915->dirty)
-      i915_update_dynamic( i915 );
-
-   if (i915->dirty & I915_NEW_FS) {
-      i915->hardware_dirty |= I915_HW_PROGRAM; /* XXX right? */
-   }
-
+/***********************************************************************
+ * Update framebuffer state
+ */
+static void update_framebuffer(struct i915_context *i915)
+{
    /* HW emit currently references framebuffer state directly:
     */
-   if (i915->dirty & I915_NEW_FRAMEBUFFER)
-      i915->hardware_dirty |= I915_HW_STATIC;
+   i915->hardware_dirty |= I915_HW_STATIC;
+}
+
+struct i915_tracked_state i915_hw_framebuffer = {
+   "framebuffer",
+   update_framebuffer,
+   I915_NEW_FRAMEBUFFER
+};
+
+
+
+/***********************************************************************
+ */
+static struct i915_tracked_state *atoms[] = {
+   &i915_update_vertex_layout,
+   &i915_hw_samplers,
+   &i915_hw_sampler_views,
+   &i915_hw_immediate,
+   &i915_hw_dynamic,
+   &i915_hw_fs,
+   &i915_hw_framebuffer,
+   NULL,
+};
+
+void i915_update_derived(struct i915_context *i915)
+{
+   int i;
+
+   if (I915_DBG_ON(DBG_ATOMS))
+      i915_dump_dirty(i915, __FUNCTION__);
+
+   for (i = 0; atoms[i]; i++)
+      if (atoms[i]->dirty & i915->dirty)
+         atoms[i]->update(i915);
 
    i915->dirty = 0;
 }
diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c
index 9c6723b..d964483 100644
--- a/src/gallium/drivers/i915/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915/i915_state_dynamic.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include "i915_batch.h"
@@ -34,10 +34,9 @@
 #include "util/u_memory.h"
 #include "util/u_pack_color.h"
 
-#define FILE_DEBUG_FLAG DEBUG_STATE
 
 /* State that we have chosen to store in the DYNAMIC segment of the
- * i915 indirect state mechanism.  
+ * i915 indirect state mechanism.
  *
  * Can't cache these in the way we do the static state, as there is no
  * start/size in the command packet, instead an 'end' value that gets
@@ -47,10 +46,10 @@
  * (active) state every time a 4kb boundary is crossed.
  */
 
-static INLINE void set_dynamic_indirect( struct i915_context *i915,
-					 unsigned offset,
-					 const unsigned *src,
-					 unsigned dwords )
+static INLINE void set_dynamic_indirect(struct i915_context *i915,
+                                        unsigned offset,
+                                        const unsigned *src,
+                                        unsigned dwords)
 {
    unsigned i;
 
@@ -61,38 +60,41 @@
 }
 
 
+
 /***********************************************************************
- * Modes4: stencil masks and logicop 
+ * Modes4: stencil masks and logicop
  */
-static void upload_MODES4( struct i915_context *i915 )
+static void upload_MODES4(struct i915_context *i915)
 {
    unsigned modes4 = 0;
 
-   /* I915_NEW_STENCIL */
+   /* I915_NEW_STENCIL
+    */
    modes4 |= i915->depth_stencil->stencil_modes4;
-   /* I915_NEW_BLEND */
+
+   /* I915_NEW_BLEND
+    */
    modes4 |= i915->blend->modes4;
 
-   /* Always, so that we know when state is in-active: 
+   /* Always, so that we know when state is in-active:
     */
-   set_dynamic_indirect( i915, 
-			 I915_DYNAMIC_MODES4,
-			 &modes4,
-			 1 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_MODES4,
+                        &modes4,
+                        1);
 }
 
 const struct i915_tracked_state i915_upload_MODES4 = {
-   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL,
-   upload_MODES4
+   "MODES4",
+   upload_MODES4,
+   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL
 };
 
 
 
-
 /***********************************************************************
  */
-
-static void upload_BFO( struct i915_context *i915 )
+static void upload_BFO(struct i915_context *i915)
 {
    unsigned bfo[2];
    bfo[0] = i915->depth_stencil->bfo[0];
@@ -101,88 +103,89 @@
    if (bfo[0] & BFO_ENABLE_STENCIL_REF) {
       bfo[0] |= i915->stencil_ref.ref_value[1] << BFO_STENCIL_REF_SHIFT;
    }
-   set_dynamic_indirect( i915,
-			 I915_DYNAMIC_BFO_0,
-			 &(bfo[0]),
-			 2 );
+
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_BFO_0,
+                        &(bfo[0]),
+                        2);
 }
 
 const struct i915_tracked_state i915_upload_BFO = {
-   I915_NEW_DEPTH_STENCIL,
-   upload_BFO
+   "BFO",
+   upload_BFO,
+   I915_NEW_DEPTH_STENCIL
 };
 
 
+
 /***********************************************************************
  */
-
-
-static void upload_BLENDCOLOR( struct i915_context *i915 )
+static void upload_BLENDCOLOR(struct i915_context *i915)
 {
    unsigned bc[2];
 
-   memset( bc, 0, sizeof(bc) );
+   memset(bc, 0, sizeof(bc));
 
-   /* I915_NEW_BLEND {_COLOR} 
+   /* I915_NEW_BLEND
     */
    {
       const float *color = i915->blend_color.color;
 
       bc[0] = _3DSTATE_CONST_BLEND_COLOR_CMD;
-      bc[1] = pack_ui32_float4( color[0],
-				color[1],
-				color[2], 
-				color[3] );
+      bc[1] = pack_ui32_float4(color[0],
+                               color[1],
+                               color[2],
+                               color[3]);
    }
 
-   set_dynamic_indirect( i915, 
-			 I915_DYNAMIC_BC_0,
-			 bc,
-			 2 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_BC_0,
+                        bc,
+                        2);
 }
 
 const struct i915_tracked_state i915_upload_BLENDCOLOR = {
-   I915_NEW_BLEND,
-   upload_BLENDCOLOR
+   "BLENDCOLOR",
+   upload_BLENDCOLOR,
+   I915_NEW_BLEND
 };
 
+
+
 /***********************************************************************
  */
-
-
-static void upload_IAB( struct i915_context *i915 )
+static void upload_IAB(struct i915_context *i915)
 {
    unsigned iab = i915->blend->iab;
 
-
-   set_dynamic_indirect( i915,
-			 I915_DYNAMIC_IAB,
-			 &iab,
-			 1 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_IAB,
+                        &iab,
+                        1);
 }
 
 const struct i915_tracked_state i915_upload_IAB = {
-   I915_NEW_BLEND,
-   upload_IAB
+   "IAB",
+   upload_IAB,
+   I915_NEW_BLEND
 };
 
 
+
 /***********************************************************************
  */
-
-
-
-static void upload_DEPTHSCALE( struct i915_context *i915 )
+static void upload_DEPTHSCALE(struct i915_context *i915)
 {
-   set_dynamic_indirect( i915,
-			 I915_DYNAMIC_DEPTHSCALE_0,
-			 &(i915->rasterizer->ds[0].u),
-			 2 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_DEPTHSCALE_0,
+                        &(i915->rasterizer->ds[0].u),
+                        2);
 }
 
 const struct i915_tracked_state i915_upload_DEPTHSCALE = {
-   I915_NEW_RASTERIZER,
-   upload_DEPTHSCALE
+   "DEPTHSCALE",
+   upload_DEPTHSCALE,
+   I915_NEW_RASTERIZER
 };
 
 
@@ -196,10 +199,9 @@
  * XXX: does stipple pattern need to be adjusted according to
  * the window position?
  *
- * XXX: possibly need workaround for conform paths test. 
+ * XXX: possibly need workaround for conform paths test.
  */
-
-static void upload_STIPPLE( struct i915_context *i915 )
+static void upload_STIPPLE(struct i915_context *i915)
 {
    unsigned st[2];
 
@@ -210,7 +212,6 @@
     */
    st[1] |= i915->rasterizer->st;
 
-
    /* I915_NEW_STIPPLE
     */
    {
@@ -225,73 +226,75 @@
       /* Not sure what to do about fallbacks, so for now just dont:
        */
       st[1] |= ((p[0] << 0) |
-		(p[1] << 4) |
-		(p[2] << 8) | 
-		(p[3] << 12));
+                (p[1] << 4) |
+                (p[2] << 8) |
+                (p[3] << 12));
    }
 
-
-   set_dynamic_indirect( i915, 
-			 I915_DYNAMIC_STP_0,
-			 &st[0],
-			 2 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_STP_0,
+                        &st[0],
+                        2);
 }
 
-
 const struct i915_tracked_state i915_upload_STIPPLE = {
-   I915_NEW_RASTERIZER | I915_NEW_STIPPLE,
-   upload_STIPPLE
+   "STIPPLE",
+   upload_STIPPLE,
+   I915_NEW_RASTERIZER | I915_NEW_STIPPLE
 };
 
 
 
 /***********************************************************************
- * Scissor.
+ * Scissor enable
  */
 static void upload_SCISSOR_ENABLE( struct i915_context *i915 )
 {
-   set_dynamic_indirect( i915,
-			 I915_DYNAMIC_SC_ENA_0,
-			 &(i915->rasterizer->sc[0]),
-			 1 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_SC_ENA_0,
+                        &(i915->rasterizer->sc[0]),
+                        1);
 }
 
 const struct i915_tracked_state i915_upload_SCISSOR_ENABLE = {
-   I915_NEW_RASTERIZER,
-   upload_SCISSOR_ENABLE
+   "SCISSOR ENABLE",
+   upload_SCISSOR_ENABLE,
+   I915_NEW_RASTERIZER
 };
 
 
 
-static void upload_SCISSOR_RECT( struct i915_context *i915 )
+/***********************************************************************
+ * Scissor rect
+ */
+static void upload_SCISSOR_RECT(struct i915_context *i915)
 {
    unsigned x1 = i915->scissor.minx;
    unsigned y1 = i915->scissor.miny;
    unsigned x2 = i915->scissor.maxx;
    unsigned y2 = i915->scissor.maxy;
    unsigned sc[3];
- 
+
    sc[0] = _3DSTATE_SCISSOR_RECT_0_CMD;
    sc[1] = (y1 << 16) | (x1 & 0xffff);
    sc[2] = (y2 << 16) | (x2 & 0xffff);
 
-   set_dynamic_indirect( i915, 
-			 I915_DYNAMIC_SC_RECT_0,
-			 &sc[0],
-			 3 );
+   set_dynamic_indirect(i915,
+                        I915_DYNAMIC_SC_RECT_0,
+                        &sc[0],
+                        3);
 }
 
-
 const struct i915_tracked_state i915_upload_SCISSOR_RECT = {
-   I915_NEW_SCISSOR,
-   upload_SCISSOR_RECT
+   "SCISSOR RECT",
+   upload_SCISSOR_RECT,
+   I915_NEW_SCISSOR
 };
 
 
 
-
-
-
+/***********************************************************************
+ */
 static const struct i915_tracked_state *atoms[] = {
    &i915_upload_MODES4,
    &i915_upload_BFO,
@@ -306,12 +309,17 @@
 /* These will be dynamic indirect state commands, but for now just end
  * up on the batch buffer with everything else.
  */
-void i915_update_dynamic( struct i915_context *i915 )
+static void update_dynamic(struct i915_context *i915)
 {
    int i;
 
    for (i = 0; i < Elements(atoms); i++)
       if (i915->dirty & atoms[i]->dirty)
-	 atoms[i]->update( i915 );
+         atoms[i]->update(i915);
 }
 
+struct i915_tracked_state i915_hw_dynamic = {
+   "dynamic",
+   update_dynamic,
+   ~0 /* all state atoms, because we do internal checking */
+};
diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
index 22082fe..bbf9ff5 100644
--- a/src/gallium/drivers/i915/i915_state_emit.c
+++ b/src/gallium/drivers/i915/i915_state_emit.c
@@ -29,6 +29,7 @@
 #include "i915_reg.h"
 #include "i915_context.h"
 #include "i915_batch.h"
+#include "i915_debug.h"
 #include "i915_reg.h"
 #include "i915_resource.h"
 
@@ -111,15 +112,20 @@
                              3
                            ) * 3/2; /* plus 50% margin */
 
-#if 0
-   debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
-#endif
-   
+   uintptr_t save_ptr;
+   size_t save_relocs;
+
+   if (I915_DBG_ON(DBG_ATOMS))
+      i915_dump_hardware_dirty(i915, __FUNCTION__);
+
    if(!BEGIN_BATCH(dwords, relocs)) {
       FLUSH_BATCH(NULL);
       assert(BEGIN_BATCH(dwords, relocs));
    }
 
+   save_ptr = (uintptr_t)i915->batch->ptr;
+   save_relocs = i915->batch->relocs;
+
    /* 14 dwords, 0 relocs */
    if (i915->hardware_dirty & I915_HW_INVARIENT)
    {
@@ -399,6 +405,9 @@
       OUT_BATCH(0);
    }
 
+   I915_DBG(DBG_EMIT, "%s: used %d dwords, %d relocs\n", __FUNCTION__,
+            (int)(((uintptr_t)i915->batch->ptr - save_ptr) / 4), /* %d needs int, not uintptr_t */
+            (int)(i915->batch->relocs - save_relocs));           /* %d needs int, not size_t */
 
    i915->hardware_dirty = 0;
 }
diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c
index 8cec699..f9ade70 100644
--- a/src/gallium/drivers/i915/i915_state_immediate.c
+++ b/src/gallium/drivers/i915/i915_state_immediate.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,13 +22,13 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
  /*
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
- 
+
 #include "i915_state_inlines.h"
 #include "i915_context.h"
 #include "i915_state.h"
@@ -46,30 +46,31 @@
 
 
 /***********************************************************************
- * S0,S1: Vertex buffer state.  
+ * S0,S1: Vertex buffer state.
  */
 static void upload_S0S1(struct i915_context *i915)
 {
    unsigned LIS0, LIS1;
 
-   /* I915_NEW_VBO */
-   /* TODO: re-use vertex buffers here? */
+   /* I915_NEW_VBO
+    */
    LIS0 = i915->vbo_offset;
 
-   /* I915_NEW_VERTEX_SIZE -- do this where the vertex size is calculated! 
+   /* I915_NEW_VERTEX_SIZE
     */
+   /* XXX do this where the vertex size is calculated! */
    {
       unsigned vertex_size = i915->current.vertex_info.size;
 
       LIS1 = ((vertex_size << 24) |
-	      (vertex_size << 16));
+              (vertex_size << 16));
    }
 
-   /* I915_NEW_VBO */
-   /* TODO: use a vertex generation number to track vbo changes */
+   /* I915_NEW_VBO
+    */
    if (1 ||
        i915->current.immediate[I915_IMMEDIATE_S0] != LIS0 ||
-       i915->current.immediate[I915_IMMEDIATE_S1] != LIS1) 
+       i915->current.immediate[I915_IMMEDIATE_S1] != LIS1)
    {
       i915->current.immediate[I915_IMMEDIATE_S0] = LIS0;
       i915->current.immediate[I915_IMMEDIATE_S1] = LIS1;
@@ -78,13 +79,13 @@
 }
 
 const struct i915_tracked_state i915_upload_S0S1 = {
-   I915_NEW_VBO | I915_NEW_VERTEX_FORMAT,
-   upload_S0S1
+   "imm S0 S1",
+   upload_S0S1,
+   I915_NEW_VBO | I915_NEW_VERTEX_FORMAT
 };
 
 
 
-
 /***********************************************************************
  * S4: Vertex format, rasterization state
  */
@@ -92,7 +93,8 @@
 {
    unsigned LIS2, LIS4;
 
-   /* I915_NEW_VERTEX_FORMAT */
+   /* I915_NEW_VERTEX_FORMAT
+    */
    {
       LIS2 = i915->current.vertex_info.hwfmt[1];
       LIS4 = i915->current.vertex_info.hwfmt[0];
@@ -113,35 +115,38 @@
    }
 }
 
-
 const struct i915_tracked_state i915_upload_S2S4 = {
-   I915_NEW_RASTERIZER | I915_NEW_VERTEX_FORMAT,
-   upload_S2S4
+   "imm S2 S4",
+   upload_S2S4,
+   I915_NEW_RASTERIZER | I915_NEW_VERTEX_FORMAT
 };
 
 
 
 /***********************************************************************
- * 
  */
-static void upload_S5( struct i915_context *i915 )
+static void upload_S5(struct i915_context *i915)
 {
    unsigned LIS5 = 0;
 
+   /* I915_NEW_DEPTH_STENCIL
+    */
    LIS5 |= i915->depth_stencil->stencil_LIS5;
    /* hope it's safe to set stencil ref value even if stencil test is disabled? */
    LIS5 |= i915->stencil_ref.ref_value[0] << S5_STENCIL_REF_SHIFT;
 
+   /* I915_NEW_BLEND
+    */
    LIS5 |= i915->blend->LIS5;
 
 #if 0
-   /* I915_NEW_RASTERIZER */
+   /* I915_NEW_RASTERIZER
+    */
    if (i915->state.Polygon->OffsetFill) {
       LIS5 |= S5_GLOBAL_DEPTH_OFFSET_ENABLE;
    }
 #endif
 
-
    if (LIS5 != i915->current.immediate[I915_IMMEDIATE_S5]) {
       i915->current.immediate[I915_IMMEDIATE_S5] = LIS5;
       i915->hardware_dirty |= I915_HW_IMMEDIATE;
@@ -149,14 +154,16 @@
 }
 
 const struct i915_tracked_state i915_upload_S5 = {
-   (I915_NEW_DEPTH_STENCIL | I915_NEW_BLEND | I915_NEW_RASTERIZER),
-   upload_S5
+   "imm S5",
+   upload_S5,
+   I915_NEW_DEPTH_STENCIL | I915_NEW_BLEND | I915_NEW_RASTERIZER
 };
 
 
+
 /***********************************************************************
  */
-static void upload_S6( struct i915_context *i915 )
+static void upload_S6(struct i915_context *i915)
 {
    unsigned LIS6 = (2 << S6_TRISTRIP_PV_SHIFT);
 
@@ -180,14 +187,16 @@
 }
 
 const struct i915_tracked_state i915_upload_S6 = {
-   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL | I915_NEW_FRAMEBUFFER,
-   upload_S6
+   "imm S6", /* atom name; capitalised like the sibling "imm S5" / "imm S7" atoms */
+   upload_S6,
+   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL | I915_NEW_FRAMEBUFFER
 };
 
 
+
 /***********************************************************************
  */
-static void upload_S7( struct i915_context *i915 )
+static void upload_S7(struct i915_context *i915)
 {
    unsigned LIS7;
 
@@ -202,11 +211,15 @@
 }
 
 const struct i915_tracked_state i915_upload_S7 = {
-   I915_NEW_RASTERIZER,
-   upload_S7
+   "imm S7",
+   upload_S7,
+   I915_NEW_RASTERIZER
 };
 
 
+
+/***********************************************************************
+ */
 static const struct i915_tracked_state *atoms[] = {
    &i915_upload_S0S1,
    &i915_upload_S2S4,
@@ -215,13 +228,17 @@
    &i915_upload_S7
 };
 
-/* 
- */
-void i915_update_immediate( struct i915_context *i915 )
+static void update_immediate(struct i915_context *i915)
 {
    int i;
 
    for (i = 0; i < Elements(atoms); i++)
       if (i915->dirty & atoms[i]->dirty)
-	 atoms[i]->update( i915 );
+         atoms[i]->update(i915);
 }
+
+struct i915_tracked_state i915_hw_immediate = {
+   "immediate",
+   update_immediate,
+   ~0 /* all state atoms, because we do internal checking */
+};
diff --git a/src/gallium/drivers/i915/i915_state_sampler.c b/src/gallium/drivers/i915/i915_state_sampler.c
index 77b9bcc..941259e 100644
--- a/src/gallium/drivers/i915/i915_state_sampler.c
+++ b/src/gallium/drivers/i915/i915_state_sampler.c
@@ -53,17 +53,23 @@
  *
  * So we need to update the map state when we change samplers and
  * we need to be change the sampler state when map state is changed.
- * The first part is done by calling i915_update_texture in
- * i915_update_samplers and the second part is done else where in
- * code tracking the state changes.
+ * The first part is done by calling update_texture in update_samplers
+ * and the second part is done elsewhere in code tracking the state
+ * changes.
  */
 
-static void
-i915_update_texture(struct i915_context *i915,
-                    uint unit,
-                    const struct i915_texture *tex,
-                    const struct i915_sampler_state *sampler,
-                    uint state[6]);
+static void update_texture(struct i915_context *i915,
+                           uint unit,
+                           const struct i915_texture *tex,
+                           const struct i915_sampler_state *sampler,
+                           uint state[6]);
+
+
+
+/***********************************************************************
+ * Samplers
+ */
+
 /**
  * Compute i915 texture sampling state.
  *
@@ -74,16 +80,13 @@
  */
 static void update_sampler(struct i915_context *i915,
                            uint unit,
-			   const struct i915_sampler_state *sampler,
-			   const struct i915_texture *tex,
-			   unsigned state[3] )
+                           const struct i915_sampler_state *sampler,
+                           const struct i915_texture *tex,
+                           unsigned state[3])
 {
    const struct pipe_resource *pt = &tex->b.b;
    unsigned minlod, lastlod;
 
-   /* Need to do this after updating the maps, which call the
-    * intel_finalize_mipmap_tree and hence can update firstLevel:
-    */
    state[0] = sampler->state[0];
    state[1] = sampler->state[1];
    state[2] = sampler->state[2];
@@ -118,7 +121,7 @@
            wr == PIPE_TEX_WRAP_CLAMP_TO_BORDER)) {
          if (i915->conformance_mode > 0) {
             assert(0);
-            /* 	    sampler->fallback = true; */
+            /*             sampler->fallback = true; */
             /* TODO */
          }
       }
@@ -137,8 +140,7 @@
    state[1] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT);
 }
 
-
-void i915_update_samplers( struct i915_context *i915 )
+static void update_samplers(struct i915_context *i915)
 {
    uint unit;
 
@@ -152,29 +154,38 @@
       if (i915->fragment_sampler_views[unit]) {
          struct i915_texture *texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
 
-	 update_sampler( i915,
-	                 unit,
-	                 i915->sampler[unit],       /* sampler state */
-	                 texture,                    /* texture */
-	                 i915->current.sampler[unit] /* the result */
-	                 );
-	 i915_update_texture( i915,
-	                      unit,
-	                      texture,                      /* texture */
-	                      i915->sampler[unit],          /* sampler state */
-	                      i915->current.texbuffer[unit] );
+         update_sampler(i915,
+                        unit,
+                        i915->sampler[unit],          /* sampler state */
+                        texture,                      /* texture */
+                        i915->current.sampler[unit]); /* the result */
+         update_texture(i915,
+                        unit,
+                        texture,                        /* texture */
+                        i915->sampler[unit],            /* sampler state */
+                        i915->current.texbuffer[unit]); /* the result */
 
-	 i915->current.sampler_enable_nr++;
-	 i915->current.sampler_enable_flags |= (1 << unit);
+         i915->current.sampler_enable_nr++;
+         i915->current.sampler_enable_flags |= (1 << unit);
       }
    }
 
    i915->hardware_dirty |= I915_HW_SAMPLER | I915_HW_MAP;
 }
 
+struct i915_tracked_state i915_hw_samplers = {
+   "samplers", /* atom name; must differ from i915_hw_sampler_views ("sampler_views") */
+   update_samplers,
+   I915_NEW_SAMPLER | I915_NEW_SAMPLER_VIEW /* dirty bits that trigger update_samplers */
+};
 
-static uint
-translate_texture_format(enum pipe_format pipeFormat)
+
+
+/***********************************************************************
+ * Sampler views
+ */
+
+static uint translate_texture_format(enum pipe_format pipeFormat)
 {
    switch (pipeFormat) {
    case PIPE_FORMAT_L8_UNORM:
@@ -226,19 +237,17 @@
       return (MAPSURF_32BIT | MT_32BIT_xI824);
    default:
       debug_printf("i915: translate_texture_format() bad image format %x\n",
-              pipeFormat);
+                   pipeFormat);
       assert(0);
       return 0;
    }
 }
 
-
-static void
-i915_update_texture(struct i915_context *i915,
-                    uint unit,
-                    const struct i915_texture *tex,
-                    const struct i915_sampler_state *sampler,
-                    uint state[6])
+static void update_texture(struct i915_context *i915,
+                           uint unit,
+                           const struct i915_texture *tex,
+                           const struct i915_sampler_state *sampler,
+                           uint state[6])
 {
    const struct pipe_resource *pt = &tex->b.b;
    uint format, pitch;
@@ -287,9 +296,7 @@
        | ((depth - 1) << MS4_VOLUME_DEPTH_SHIFT));
 }
 
-
-void
-i915_update_textures(struct i915_context *i915)
+static void update_textures(struct i915_context *i915)
 {
    uint unit;
 
@@ -300,13 +307,19 @@
       if (i915->fragment_sampler_views[unit]) {
          struct i915_texture *texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
 
-	 i915_update_texture( i915,
-	                      unit,
-	                      texture,                      /* texture */
-	                      i915->sampler[unit],          /* sampler state */
-	                      i915->current.texbuffer[unit] );
+         update_texture(i915,
+                        unit,
+                        texture,                      /* texture */
+                        i915->sampler[unit],          /* sampler state */
+                        i915->current.texbuffer[unit]);
       }
    }
 
    i915->hardware_dirty |= I915_HW_MAP;
 }
+
+struct i915_tracked_state i915_hw_sampler_views = {
+   "sampler_views",
+   update_textures,
+   I915_NEW_SAMPLER_VIEW
+};
diff --git a/src/gallium/drivers/i965/SConscript b/src/gallium/drivers/i965/SConscript
index 019af68..119f914 100644
--- a/src/gallium/drivers/i965/SConscript
+++ b/src/gallium/drivers/i965/SConscript
@@ -2,6 +2,10 @@
 
 env = env.Clone()
 
+if msvc:
+	print 'warning: not building i965g'
+	Return();
+
 i965 = env.ConvenienceLibrary(
 	target = 'i965',
 	source = [
diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c
index 4c85793..28c8351 100644
--- a/src/gallium/drivers/i965/brw_disasm.c
+++ b/src/gallium/drivers/i965/brw_disasm.c
@@ -239,7 +239,7 @@
     [2] = "UW",
     [3] = "W",
     [5] = "VF",
-    [5] = "V",
+    [6] = "V",
     [7] = "F"
 };
 
diff --git a/src/gallium/drivers/identity/id_objects.c b/src/gallium/drivers/identity/id_objects.c
index ca4743f..593928f 100644
--- a/src/gallium/drivers/identity/id_objects.c
+++ b/src/gallium/drivers/identity/id_objects.c
@@ -120,13 +120,14 @@
 
    assert(view->texture == id_resource->resource);
 
-   id_view = MALLOC(sizeof(struct identity_sampler_view));
+   id_view = CALLOC_STRUCT(identity_sampler_view);
 
    id_view->base = *view;
    id_view->base.reference.count = 1;
    id_view->base.texture = NULL;
    pipe_resource_reference(&id_view->base.texture, id_resource->resource);
    id_view->base.context = id_context->pipe;
+   id_view->sampler_view = view;
 
    return &id_view->base;
 error:
@@ -180,8 +181,8 @@
                           struct identity_transfer *id_transfer)
 {
    pipe_resource_reference(&id_transfer->base.resource, NULL);
-   id_transfer->pipe->transfer_destroy(id_context->pipe,
-                                       id_transfer->transfer);
+   id_context->pipe->transfer_destroy(id_context->pipe,
+                                      id_transfer->transfer);
    FREE(id_transfer);
 }
 
diff --git a/src/gallium/drivers/identity/id_objects.h b/src/gallium/drivers/identity/id_objects.h
index 5eea10b..e8deabf 100644
--- a/src/gallium/drivers/identity/id_objects.h
+++ b/src/gallium/drivers/identity/id_objects.h
@@ -65,7 +65,6 @@
 {
    struct pipe_transfer base;
 
-   struct pipe_context *pipe;
    struct pipe_transfer *transfer;
 };
 
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore
index a1b6f56..4e0d4c3 100644
--- a/src/gallium/drivers/llvmpipe/.gitignore
+++ b/src/gallium/drivers/llvmpipe/.gitignore
@@ -3,3 +3,4 @@
 lp_test_conv
 lp_test_format
 lp_test_printf
+lp_test_sincos
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index c79c8bd..ee28179 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -37,6 +37,7 @@
 	lp_state_gs.c \
 	lp_state_rasterizer.c \
 	lp_state_sampler.c \
+        lp_state_so.c \
 	lp_state_surface.c \
 	lp_state_vertex.c \
 	lp_state_vs.c \
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index a064669..a1ef71d 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -57,6 +57,7 @@
 		'lp_state_gs.c',
 		'lp_state_rasterizer.c',
 		'lp_state_sampler.c',
+                'lp_state_so.c',
 		'lp_state_surface.c',
 		'lp_state_vertex.c',
 		'lp_state_vs.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 9e88a6e..3db4f12 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -36,6 +36,7 @@
 #include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_simple_list.h"
 #include "lp_clear.h"
 #include "lp_context.h"
 #include "lp_flush.h"
@@ -94,6 +95,8 @@
 
    memset(llvmpipe, 0, sizeof *llvmpipe);
 
+   make_empty_list(&llvmpipe->fs_variants_list);
+
    llvmpipe->pipe.winsys = screen->winsys;
    llvmpipe->pipe.screen = screen;
    llvmpipe->pipe.priv = priv;
@@ -110,6 +113,7 @@
    llvmpipe_init_sampler_funcs(llvmpipe);
    llvmpipe_init_query_funcs( llvmpipe );
    llvmpipe_init_vertex_funcs(llvmpipe);
+   llvmpipe_init_so_funcs(llvmpipe);
    llvmpipe_init_fs_funcs(llvmpipe);
    llvmpipe_init_vs_funcs(llvmpipe);
    llvmpipe_init_gs_funcs(llvmpipe);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index cb04d4a..986e604 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -38,6 +38,7 @@
 #include "lp_tex_sample.h"
 #include "lp_jit.h"
 #include "lp_setup.h"
+#include "lp_state_fs.h"
 
 
 struct llvmpipe_vbuf_render;
@@ -62,6 +63,7 @@
    const struct lp_vertex_shader *vs;
    const struct lp_geometry_shader *gs;
    const struct lp_velems_state *velems;
+   const struct lp_so_state *so;
 
    /** Other rendering state */
    struct pipe_blend_color blend_color;
@@ -75,6 +77,12 @@
    struct pipe_sampler_view *vertex_sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct {
+      struct llvmpipe_resource *buffer[PIPE_MAX_SO_BUFFERS];
+      int offset[PIPE_MAX_SO_BUFFERS];
+      int so_count[PIPE_MAX_SO_BUFFERS];
+      int num_buffers;
+   } so_target;
 
    unsigned num_samplers;
    unsigned num_fragment_sampler_views;
@@ -105,6 +113,8 @@
    unsigned tex_timestamp;
    boolean no_rast;
 
+   struct lp_fs_variant_list_item fs_variants_list;
+   unsigned nr_fs_variants;
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h
index 4102a9d..d1c4314 100644
--- a/src/gallium/drivers/llvmpipe/lp_limits.h
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -66,5 +66,10 @@
  */
 #define LP_MAX_SCENE_SIZE (512 * 1024 * 1024)
 
+/**
+ * Max number of shader variants (for all shaders combined,
+ * per context) that will be kept around.
+ */
+#define LP_MAX_SHADER_VARIANTS 1024
 
 #endif /* LP_LIMITS_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 3f7a85b..05d1b93 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -54,6 +54,9 @@
 #define LP_NEW_QUERY         0x4000
 #define LP_NEW_BLEND_COLOR   0x8000
 #define LP_NEW_GS            0x10000
+#define LP_NEW_SO            0x20000
+#define LP_NEW_SO_BUFFERS    0x40000
+
 
 
 struct vertex_info;
@@ -82,6 +85,10 @@
    struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
 };
 
+struct lp_so_state {
+   struct pipe_stream_output_state base;
+};
+
 
 void
 llvmpipe_set_framebuffer_state(struct pipe_context *,
@@ -120,5 +127,9 @@
 void
 llvmpipe_init_rasterizer_funcs(struct llvmpipe_context *llvmpipe);
 
+void
+llvmpipe_init_so_funcs(struct llvmpipe_context *llvmpipe);
+
+
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 2619e04..6511505 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -68,6 +68,7 @@
 #include "util/u_format.h"
 #include "util/u_dump.h"
 #include "util/u_string.h"
+#include "util/u_simple_list.h"
 #include "os/os_time.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
@@ -95,6 +96,7 @@
 #include "lp_setup.h"
 #include "lp_state.h"
 #include "lp_tex_sample.h"
+#include "lp_flush.h"
 
 
 #include <llvm-c/Analysis.h>
@@ -936,7 +938,10 @@
    if(!variant)
       return NULL;
 
-   variant->no = shader->variant_no++;
+   variant->shader = shader;
+   variant->list_item_global.base = variant;
+   variant->list_item_local.base = variant;
+   variant->no = shader->variants_created++;
 
    memcpy(&variant->key, key, sizeof *key);
 
@@ -962,10 +967,6 @@
          !shader->info.uses_kill
          ? TRUE : FALSE;
 
-   /* insert new variant into linked list */
-   variant->next = shader->variants;
-   shader->variants = variant;
-
    return variant;
 }
 
@@ -981,6 +982,7 @@
       return NULL;
 
    shader->no = fs_no++;
+   make_empty_list(&shader->variants);
 
    /* get/save the summary info for this shader */
    tgsi_scan_shader(templ->tokens, &shader->info);
@@ -1024,14 +1026,40 @@
    llvmpipe->dirty |= LP_NEW_FS;
 }
 
+static void
+remove_shader_variant(struct llvmpipe_context *lp,
+                      struct lp_fragment_shader_variant *variant)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+   unsigned i;
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached #%u v total cached #%u\n",
+                    variant->shader->no, variant->no, variant->shader->variants_created,
+                    variant->shader->variants_cached, lp->nr_fs_variants);
+   }
+   for (i = 0; i < Elements(variant->function); i++) {
+      if (variant->function[i]) {
+         if (variant->jit_function[i])
+            LLVMFreeMachineCodeForFunction(screen->engine,
+                                           variant->function[i]);
+         LLVMDeleteFunction(variant->function[i]);
+      }
+   }
+   remove_from_list(&variant->list_item_local);
+   variant->shader->variants_cached--;
+   remove_from_list(&variant->list_item_global);
+   lp->nr_fs_variants--;
+   FREE(variant);
+}
 
 static void
 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen);
+   struct pipe_fence_handle *fence = NULL;
    struct lp_fragment_shader *shader = fs;
-   struct lp_fragment_shader_variant *variant;
+   struct lp_fs_variant_list_item *li;
 
    assert(fs != llvmpipe->fs);
    (void) llvmpipe;
@@ -1039,29 +1067,24 @@
    /*
     * XXX: we need to flush the context until we have some sort of reference
     * counting in fragment shaders as they may still be binned
+    * Flushing alone might not be sufficient; we need to wait on it too.
     */
-   draw_flush(llvmpipe->draw);
-   lp_setup_flush(llvmpipe->setup, 0);
 
-   variant = shader->variants;
-   while(variant) {
-      struct lp_fragment_shader_variant *next = variant->next;
-      unsigned i;
+   llvmpipe_flush(pipe, 0, &fence);
 
-      for (i = 0; i < Elements(variant->function); i++) {
-         if (variant->function[i]) {
-            if (variant->jit_function[i])
-               LLVMFreeMachineCodeForFunction(screen->engine,
-                                              variant->function[i]);
-            LLVMDeleteFunction(variant->function[i]);
-         }
-      }
-
-      FREE(variant);
-
-      variant = next;
+   if (fence) {
+      pipe->screen->fence_finish(pipe->screen, fence, 0);
+      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
    }
 
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      struct lp_fs_variant_list_item *next = next_elem(li);
+      remove_shader_variant(llvmpipe, li->base);
+      li = next;
+   }
+
+   assert(shader->variants_cached == 0);
    FREE((void *) shader->base.tokens);
    FREE(shader);
 }
@@ -1088,9 +1111,10 @@
    /* note: reference counting */
    pipe_resource_reference(&llvmpipe->constants[shader][index], constants);
 
-   if(shader == PIPE_SHADER_VERTEX) {
-      draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX, index,
-                                      data, size);
+   if(shader == PIPE_SHADER_VERTEX ||
+      shader == PIPE_SHADER_GEOMETRY) {
+      draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
+                                      index, data, size);
    }
 
    llvmpipe->dirty |= LP_NEW_CONSTANTS;
@@ -1215,7 +1239,6 @@
          lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]);
 }
 
-
 /**
  * Update fragment state.  This is called just prior to drawing
  * something when some fragment-related state has changed.
@@ -1225,21 +1248,47 @@
 {
    struct lp_fragment_shader *shader = lp->fs;
    struct lp_fragment_shader_variant_key key;
-   struct lp_fragment_shader_variant *variant;
+   struct lp_fragment_shader_variant *variant = NULL;
+   struct lp_fs_variant_list_item *li;
 
    make_variant_key(lp, shader, &key);
 
-   variant = shader->variants;
-   while(variant) {
-      if(memcmp(&variant->key, &key, sizeof key) == 0)
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      if(memcmp(&li->base->key, &key, sizeof key) == 0) {
+         variant = li->base;
          break;
-
-      variant = variant->next;
+      }
+      li = next_elem(li);
    }
 
-   if (!variant) {
+   if (variant) {
+      move_to_head(&lp->fs_variants_list, &variant->list_item_global);
+   }
+   else {
       int64_t t0, t1;
       int64_t dt;
+      unsigned i;
+      if (lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS) {
+         struct pipe_context *pipe = &lp->pipe;
+         struct pipe_fence_handle *fence = NULL;
+
+         /*
+          * XXX: we need to flush the context until we have some sort of reference
+          * counting in fragment shaders as they may still be binned
+          * Flushing alone might not be sufficient; we need to wait on it too.
+          */
+         llvmpipe_flush(pipe, 0, &fence);
+
+         if (fence) {
+            pipe->screen->fence_finish(pipe->screen, fence, 0);
+            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+         }
+         for (i = 0; i < LP_MAX_SHADER_VARIANTS / 4; i++) {
+            struct lp_fs_variant_list_item *item = last_elem(&lp->fs_variants_list);
+            remove_shader_variant(lp, item->base);
+         }
+      }
       t0 = os_time_get();
 
       variant = generate_variant(lp, shader, &key);
@@ -1248,6 +1297,13 @@
       dt = t1 - t0;
       LP_COUNT_ADD(llvm_compile_time, dt);
       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
+
+      if (variant) {
+         insert_at_head(&shader->variants, &variant->list_item_local);
+         insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
+         lp->nr_fs_variants++;
+         shader->variants_cached++;
+      }
    }
 
    lp_setup_set_fs_variant(lp->setup, variant);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 64ead2a..593cd4d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -64,6 +64,11 @@
    struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
+struct lp_fs_variant_list_item
+{
+   struct lp_fragment_shader_variant *base;
+   struct lp_fs_variant_list_item *next, *prev;
+};
 
 struct lp_fragment_shader_variant
 {
@@ -75,7 +80,8 @@
 
    lp_jit_frag_func jit_function[2];
 
-   struct lp_fragment_shader_variant *next;
+   struct lp_fs_variant_list_item list_item_global, list_item_local;
+   struct lp_fragment_shader *shader;
 
    /* For debugging/profiling purposes */
    unsigned no;
@@ -89,11 +95,12 @@
 
    struct tgsi_shader_info info;
 
-   struct lp_fragment_shader_variant *variants;
+   struct lp_fs_variant_list_item variants;
 
    /* For debugging/profiling purposes */
    unsigned no;
-   unsigned variant_no;
+   unsigned variants_created;
+   unsigned variants_cached;
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 55d4336..e94065f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -105,6 +105,13 @@
 
 
 static void
+llvmpipe_bind_geometry_sampler_states(struct pipe_context *pipe,
+                                      unsigned num, void **sampler)
+{
+   /* XXX: implementation missing */
+}
+
+static void
 llvmpipe_set_fragment_sampler_views(struct pipe_context *pipe,
                                     unsigned num,
                                     struct pipe_sampler_view **views)
@@ -163,6 +170,14 @@
 }
 
 
+static void
+llvmpipe_set_geometry_sampler_views(struct pipe_context *pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   /*XXX: implementation missing */
+}
+
 static struct pipe_sampler_view *
 llvmpipe_create_sampler_view(struct pipe_context *pipe,
                             struct pipe_resource *texture,
@@ -206,8 +221,10 @@
 
    llvmpipe->pipe.bind_fragment_sampler_states  = llvmpipe_bind_sampler_states;
    llvmpipe->pipe.bind_vertex_sampler_states  = llvmpipe_bind_vertex_sampler_states;
+   llvmpipe->pipe.bind_geometry_sampler_states  = llvmpipe_bind_geometry_sampler_states;
    llvmpipe->pipe.set_fragment_sampler_views = llvmpipe_set_fragment_sampler_views;
    llvmpipe->pipe.set_vertex_sampler_views = llvmpipe_set_vertex_sampler_views;
+   llvmpipe->pipe.set_geometry_sampler_views = llvmpipe_set_geometry_sampler_views;
    llvmpipe->pipe.create_sampler_view = llvmpipe_create_sampler_view;
    llvmpipe->pipe.sampler_view_destroy = llvmpipe_sampler_view_destroy;
    llvmpipe->pipe.delete_sampler_state = llvmpipe_delete_sampler_state;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c
new file mode 100644
index 0000000..30b17c9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_so.c
@@ -0,0 +1,137 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_texture.h"
+
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+
+/* Allocate a stream-output CSO holding a copy of the used part of templ. */
+static void *
+llvmpipe_create_stream_output_state(struct pipe_context *pipe,
+                                    const struct pipe_stream_output_state *templ)
+{
+   struct lp_so_state *state = CALLOC_STRUCT(lp_so_state);
+
+   if (!state)
+      return NULL;
+   state->base.num_outputs = templ->num_outputs;
+   state->base.stride = templ->stride;
+   memcpy(state->base.output_buffer,
+          templ->output_buffer,
+          sizeof(int) * templ->num_outputs);
+   memcpy(state->base.register_index,
+          templ->register_index,
+          sizeof(int) * templ->num_outputs);
+   memcpy(state->base.register_mask,
+          templ->register_mask,
+          sizeof(ubyte) * templ->num_outputs);
+   return state;
+}
+
+static void
+llvmpipe_bind_stream_output_state(struct pipe_context *pipe,
+                                  void *so)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct lp_so_state *lp_so = (struct lp_so_state *) so;
+
+   lp->so = lp_so;
+
+   lp->dirty |= LP_NEW_SO;
+
+   if (lp_so)
+      draw_set_so_state(lp->draw, &lp_so->base);
+}
+
+static void
+llvmpipe_delete_stream_output_state(struct pipe_context *pipe, void *so)
+{
+   FREE( so );
+}
+
+static void
+llvmpipe_set_stream_output_buffers(struct pipe_context *pipe,
+                                   struct pipe_resource **buffers,
+                                   int *offsets,
+                                   int num_buffers)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   int i;
+   void *map_buffers[PIPE_MAX_SO_BUFFERS];
+
+   assert(num_buffers <= PIPE_MAX_SO_BUFFERS);
+   if (num_buffers > PIPE_MAX_SO_BUFFERS)
+      num_buffers = PIPE_MAX_SO_BUFFERS;
+
+   lp->dirty |= LP_NEW_SO_BUFFERS;
+
+   for (i = 0; i < num_buffers; ++i) {
+      void *mapped;
+      struct llvmpipe_resource *res = llvmpipe_resource(buffers[i]);
+
+      if (!res) {
+         /* the whole call is invalid, bail out */
+         lp->so_target.num_buffers = 0;
+         draw_set_mapped_so_buffers(lp->draw, 0, 0);
+         return;
+      }
+
+      lp->so_target.buffer[i] = res;
+      lp->so_target.offset[i] = offsets[i];
+      lp->so_target.so_count[i] = 0;
+
+      mapped = res->data;
+      if (offsets[i] >= 0)
+         map_buffers[i] = ((char*)mapped) + offsets[i];
+      else {
+         /* this is a buffer append */
+         assert(!"appending not implemented");
+         map_buffers[i] = mapped;
+      }
+   }
+   lp->so_target.num_buffers = num_buffers;
+
+   draw_set_mapped_so_buffers(lp->draw, map_buffers, num_buffers);
+}
+
+void
+llvmpipe_init_so_funcs(struct llvmpipe_context *llvmpipe)
+{
+   llvmpipe->pipe.create_stream_output_state =
+      llvmpipe_create_stream_output_state;
+   llvmpipe->pipe.bind_stream_output_state =
+      llvmpipe_bind_stream_output_state;
+   llvmpipe->pipe.delete_stream_output_state =
+      llvmpipe_delete_stream_output_state;
+
+   llvmpipe->pipe.set_stream_output_buffers =
+      llvmpipe_set_stream_output_buffers;
+}
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index 6bb82e5..dd897f6 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -14,6 +14,8 @@
 	r300_hyperz.c \
 	r300_query.c \
 	r300_render.c \
+	r300_render_stencilref.c \
+	r300_render_translate.c \
 	r300_resource.c \
 	r300_screen.c \
 	r300_screen_buffer.c \
diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript
index eb3e1d3..ee19e9d 100644
--- a/src/gallium/drivers/r300/SConscript
+++ b/src/gallium/drivers/r300/SConscript
@@ -24,6 +24,8 @@
         'r300_hyperz.c',
         'r300_query.c',
         'r300_render.c',
+        'r300_render_stencilref.c',
+        'r300_render_translate.c',
         'r300_resource.c',
         'r300_screen.c',
         'r300_screen_buffer.c',
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index cc64fc3..389354c 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -25,8 +25,23 @@
 
 #include "util/u_format.h"
 
-static void r300_blitter_save_states(struct r300_context* r300)
+enum r300_blitter_op
 {
+    R300_CLEAR,
+    R300_CLEAR_SURFACE,
+    R300_COPY
+};
+
+static void r300_blitter_begin(struct r300_context* r300, enum r300_blitter_op op)
+{
+    if (r300->query_current) {
+        r300->blitter_saved_query = r300->query_current;
+        r300_stop_query(r300);
+    }
+
+    /* Yeah we have to save all those states to ensure the blitter operation
+     * is really transparent. The states will be restored by the blitter once
+     * copying is done. */
     util_blitter_save_blend(r300->blitter, r300->blend_state.state);
     util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state.state);
     util_blitter_save_stencil_ref(r300->blitter, &(r300->stencil_ref));
@@ -34,10 +49,34 @@
     util_blitter_save_fragment_shader(r300->blitter, r300->fs.state);
     util_blitter_save_vertex_shader(r300->blitter, r300->vs_state.state);
     util_blitter_save_viewport(r300->blitter, &r300->viewport);
-    util_blitter_save_clip(r300->blitter, &r300->clip);
+    util_blitter_save_clip(r300->blitter, (struct pipe_clip_state*)r300->clip_state.state);
     util_blitter_save_vertex_elements(r300->blitter, r300->velems);
     util_blitter_save_vertex_buffers(r300->blitter, r300->vertex_buffer_count,
                                      r300->vertex_buffer);
+
+    if (op & (R300_CLEAR_SURFACE | R300_COPY))
+        util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
+
+    if (op & R300_COPY) {
+        struct r300_textures_state* state =
+            (struct r300_textures_state*)r300->textures_state.state;
+
+        util_blitter_save_fragment_sampler_states(
+            r300->blitter, state->sampler_state_count,
+            (void**)state->sampler_states);
+
+        util_blitter_save_fragment_sampler_views(
+            r300->blitter, state->sampler_view_count,
+            (struct pipe_sampler_view**)state->sampler_views);
+    }
+}
+
+static void r300_blitter_end(struct r300_context *r300)
+{
+    if (r300->blitter_saved_query) {
+        r300_resume_query(r300, r300->blitter_saved_query);
+        r300->blitter_saved_query = NULL;
+    }
 }
 
 /* Clear currently bound buffers. */
@@ -73,13 +112,45 @@
     struct pipe_framebuffer_state* fb =
         (struct pipe_framebuffer_state*)r300->fb_state.state;
 
-    r300_blitter_save_states(r300);
-
+    r300_blitter_begin(r300, R300_CLEAR);
     util_blitter_clear(r300->blitter,
                        fb->width,
                        fb->height,
                        fb->nr_cbufs,
                        buffers, rgba, depth, stencil);
+    r300_blitter_end(r300);
+}
+
+/* Clear a region of a color surface to a constant value. */
+static void r300_clear_render_target(struct pipe_context *pipe,
+                                     struct pipe_surface *dst,
+                                     const float *rgba,
+                                     unsigned dstx, unsigned dsty,
+                                     unsigned width, unsigned height)
+{
+    struct r300_context *r300 = r300_context(pipe);
+
+    r300_blitter_begin(r300, R300_CLEAR_SURFACE);
+    util_blitter_clear_render_target(r300->blitter, dst, rgba,
+                                     dstx, dsty, width, height);
+    r300_blitter_end(r300);
+}
+
+/* Clear a region of a depth stencil surface. */
+static void r300_clear_depth_stencil(struct pipe_context *pipe,
+                                     struct pipe_surface *dst,
+                                     unsigned clear_flags,
+                                     double depth,
+                                     unsigned stencil,
+                                     unsigned dstx, unsigned dsty,
+                                     unsigned width, unsigned height)
+{
+    struct r300_context *r300 = r300_context(pipe);
+
+    r300_blitter_begin(r300, R300_CLEAR_SURFACE);
+    util_blitter_clear_depth_stencil(r300->blitter, dst, clear_flags, depth, stencil,
+                                     dstx, dsty, width, height);
+    r300_blitter_end(r300);
 }
 
 /* Copy a block of pixels from one surface to another using HW. */
@@ -93,27 +164,12 @@
                                 unsigned width, unsigned height)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct r300_textures_state* state =
-        (struct r300_textures_state*)r300->textures_state.state;
 
-    /* Yeah we have to save all those states to ensure this blitter operation
-     * is really transparent. The states will be restored by the blitter once
-     * copying is done. */
-    r300_blitter_save_states(r300);
-    util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
-
-    util_blitter_save_fragment_sampler_states(
-        r300->blitter, state->sampler_state_count,
-        (void**)state->sampler_states);
-
-    util_blitter_save_fragment_sampler_views(
-        r300->blitter, state->sampler_view_count,
-        (struct pipe_sampler_view**)state->sampler_views);
-
-    /* Do a copy */
+    r300_blitter_begin(r300, R300_COPY);
     util_blitter_copy_region(r300->blitter, dst, subdst, dstx, dsty, dstz,
                              src, subsrc, srcx, srcy, srcz, width, height,
                              TRUE);
+    r300_blitter_end(r300);
 }
 
 /* Copy a block of pixels from one surface to another. */
@@ -129,14 +185,6 @@
     enum pipe_format old_format = dst->format;
     enum pipe_format new_format = old_format;
 
-    if (dst->format != src->format) {
-        debug_printf("r300: Implementation error: Format mismatch in %s\n"
-            "    : src: %s dst: %s\n", __FUNCTION__,
-            util_format_short_name(src->format),
-            util_format_short_name(dst->format));
-        debug_assert(0);
-    }
-
     if (!pipe->screen->is_format_supported(pipe->screen,
                                            old_format, src->target,
                                            src->nr_samples,
@@ -187,40 +235,6 @@
     }
 }
 
-/* Clear a region of a color surface to a constant value. */
-static void r300_clear_render_target(struct pipe_context *pipe,
-                                     struct pipe_surface *dst,
-                                     const float *rgba,
-                                     unsigned dstx, unsigned dsty,
-                                     unsigned width, unsigned height)
-{
-    struct r300_context *r300 = r300_context(pipe);
-
-    r300_blitter_save_states(r300);
-    util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
-
-    util_blitter_clear_render_target(r300->blitter, dst, rgba,
-                                     dstx, dsty, width, height);
-}
-
-/* Clear a region of a depth stencil surface. */
-static void r300_clear_depth_stencil(struct pipe_context *pipe,
-                                     struct pipe_surface *dst,
-                                     unsigned clear_flags,
-                                     double depth,
-                                     unsigned stencil,
-                                     unsigned dstx, unsigned dsty,
-                                     unsigned width, unsigned height)
-{
-    struct r300_context *r300 = r300_context(pipe);
-
-    r300_blitter_save_states(r300);
-    util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
-
-    util_blitter_clear_depth_stencil(r300->blitter, dst, clear_flags, depth, stencil,
-                                     dstx, dsty, width, height);
-}
-
 void r300_init_blit_functions(struct r300_context *r300)
 {
     r300->context.clear = r300_clear;
diff --git a/src/gallium/drivers/r300/r300_cb.h b/src/gallium/drivers/r300/r300_cb.h
new file mode 100644
index 0000000..6987471
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_cb.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * This file contains macros for building command buffers in memory.
+ *
+ * Use NEW_CB for buffers with a varying size and it will also allocate
+ * the buffer.
+ * Use BEGIN_CB for arrays with a static size.
+ *
+ * Example:
+ *
+ *     uint32_t cb[3];
+ *     CB_LOCALS;
+ *
+ *     BEGIN_CB(cb, 3);
+ *     OUT_CB_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2);
+ *     OUT_CB(blend_color_red_alpha);
+ *     OUT_CB(blend_color_green_blue);
+ *     END_CB;
+ *
+ * And later:
+ *
+ *     CS_LOCALS;
+ *     WRITE_CS_TABLE(cb, 3);
+ *
+ * Or using a little slower variant:
+ *
+ *     CS_LOCALS;
+ *     BEGIN_CS(cb, 3);
+ *     OUT_CS_TABLE(cb, 3);
+ *     END_CS;
+ */
+
+#ifndef R300_CB_H
+#define R300_CB_H
+
+#include "r300_reg.h"
+
+/* Yes, I know macros are ugly. However, they are much prettier than the code
+ * that they neatly hide away, and don't have the cost of function setup, so
+ * we're going to use them. */
+
+#ifdef DEBUG
+#define CB_DEBUG(x) x
+#else
+#define CB_DEBUG(x)
+#endif
+
+
+/**
+ * Command buffer setup.
+ */
+
+#define CB_LOCALS \
+    CB_DEBUG(int cs_count = 0;) \
+    uint32_t *cs_ptr = NULL; \
+    CB_DEBUG((void) cs_count;) (void) cs_ptr;
+
+#define NEW_CB(ptr, size) do { \
+    assert(sizeof(*ptr) == sizeof(uint32_t)); \
+    cs_ptr = (ptr) = (uint32_t*)malloc((size) * sizeof(uint32_t)); \
+    CB_DEBUG(cs_count = size;) \
+} while (0)
+
+#define BEGIN_CB(ptr, size) do { \
+    assert(sizeof(*ptr) == sizeof(uint32_t)); \
+    cs_ptr = ptr; \
+    CB_DEBUG(cs_count = size;) \
+} while (0)
+
+#define BEGIN_CS_AS_CB(r300, size) /* size: CB length in dwords */ \
+    BEGIN_CB((r300)->rws->get_cs_pointer((r300)->rws, (size)), (size))
+
+#define END_CB do { \
+    CB_DEBUG(if (cs_count != 0) \
+        debug_printf("r300: Warning: cs_count off by %d at (%s, %s:%i)\n", \
+                     cs_count, __FUNCTION__, __FILE__, __LINE__);) \
+} while (0)
+
+
+/**
+ * Storing pure DWORDs.
+ */
+
+#define OUT_CB(value) do { \
+    *cs_ptr = (value); \
+    cs_ptr++; \
+    CB_DEBUG(cs_count--;) \
+} while (0)
+
+#define OUT_CB_TABLE(values, count) do { \
+    memcpy(cs_ptr, values, count * sizeof(uint32_t)); \
+    cs_ptr += count; \
+    CB_DEBUG(cs_count -= count;) \
+} while (0)
+
+#define OUT_CB_32F(value) \
+    OUT_CB(fui(value));
+
+#define OUT_CB_REG(register, value) do { \
+    assert(register); \
+    OUT_CB(CP_PACKET0(register, 0)); \
+    OUT_CB(value); \
+} while (0)
+
+/* Note: This expects count to be the number of registers,
+ * not the actual packet0 count! */
+#define OUT_CB_REG_SEQ(register, count) do { \
+    assert(register); \
+    OUT_CB(CP_PACKET0(register, (count) - 1)); \
+} while (0)
+
+#define OUT_CB_ONE_REG(register, count) do { \
+    assert(register); \
+    OUT_CB(CP_PACKET0(register, (count) - 1) | RADEON_ONE_REG_WR); \
+} while (0)
+
+#define OUT_CB_PKT3(op, count) \
+    OUT_CB(CP_PACKET3(op, count))
+
+#endif /* R300_CB_H */
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index e6dca66..511aa7e 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -36,6 +36,7 @@
     caps->num_vert_fpus = 2;
     caps->num_tex_units = 16;
     caps->has_tcl = debug_get_bool_option("RADEON_NO_TCL", FALSE) ? FALSE : TRUE;
+    caps->has_hiz = TRUE;
     caps->is_r400 = FALSE;
     caps->is_r500 = FALSE;
     caps->high_second_pipe = FALSE;
@@ -76,6 +77,7 @@
         case 0x4E54:
         case 0x4E56:
             caps->family = CHIP_FAMILY_RV350;
+            caps->has_hiz = FALSE;
             caps->high_second_pipe = TRUE;
             break;
 
@@ -106,6 +108,7 @@
         case 0x5B64:
         case 0x5B65:
             caps->family = CHIP_FAMILY_RV370;
+            caps->has_hiz = FALSE;
             caps->high_second_pipe = TRUE;
             break;
 
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
index ab649c3..65750f5 100644
--- a/src/gallium/drivers/r300/r300_chipset.h
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -42,6 +42,8 @@
     unsigned num_tex_units;
     /* Whether or not TCL is physically present */
     boolean has_tcl;
+    /* Some chipsets do not have HiZ RAM. */
+    boolean has_hiz;
     /* Whether or not this is RV350 or newer, including all r400 and r500
      * chipsets. The differences compared to the oldest r300 chips are:
      * - Blend LTE/GTE thresholds
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 88ce186..46d1ed9 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -23,9 +23,11 @@
 #include "draw/draw_context.h"
 
 #include "util/u_memory.h"
+#include "util/u_sampler.h"
 #include "util/u_simple_list.h"
 #include "util/u_upload_mgr.h"
 
+#include "r300_cb.h"
 #include "r300_context.h"
 #include "r300_emit.h"
 #include "r300_screen.h"
@@ -38,9 +40,15 @@
 static void r300_destroy_context(struct pipe_context* context)
 {
     struct r300_context* r300 = r300_context(context);
-    struct r300_query* query, * temp;
+    struct r300_query *query, *temp;
     struct r300_atom *atom;
 
+    if (r300->texkill_sampler) {
+        pipe_sampler_view_reference(
+                (struct pipe_sampler_view**)&r300->texkill_sampler,
+                NULL);
+    }
+
     util_blitter_destroy(r300->blitter);
     draw_destroy(r300->draw);
 
@@ -54,9 +62,6 @@
         }
     }
 
-    /* Free the OQ BO. */
-    context->screen->resource_destroy(context->screen, r300->oqbo);
-
     /* If there are any queries pending or not destroyed, remove them now. */
     foreach_s(query, temp, &r300->query_list) {
         remove_from_list(query);
@@ -66,9 +71,13 @@
     u_upload_destroy(r300->upload_vb);
     u_upload_destroy(r300->upload_ib);
 
+    translate_cache_destroy(r300->tran.translate_cache);
+
+    FREE(r300->aa_state.state);
     FREE(r300->blend_color_state.state);
     FREE(r300->clip_state.state);
     FREE(r300->fb_state.state);
+    FREE(r300->gpu_flush.state);
     FREE(r300->rs_block_state.state);
     FREE(r300->scissor_state.state);
     FREE(r300->textures_state.state);
@@ -110,27 +119,36 @@
      * Some atoms never change size, others change every emit - those have
      * the size of 0 here. */
     make_empty_list(&r300->atom_list);
-    R300_INIT_ATOM(invariant_state, 71);
-    R300_INIT_ATOM(query_start, 4);
+    /* RB3D (unpipelined), ZB (unpipelined), US, SC. */
+    R300_INIT_ATOM(gpu_flush, 9);
+    R300_INIT_ATOM(aa_state, 4);
+    R300_INIT_ATOM(fb_state, 0);
     R300_INIT_ATOM(ztop_state, 2);
+    R300_INIT_ATOM(dsa_state, is_r500 ? 8 : 6);
     R300_INIT_ATOM(blend_state, 8);
     R300_INIT_ATOM(blend_color_state, is_r500 ? 3 : 2);
-    R300_INIT_ATOM(clip_state, has_tcl ? 5 + (6 * 4) : 2);
-    R300_INIT_ATOM(dsa_state, is_r500 ? 8 : 6);
-    R300_INIT_ATOM(fb_state, 0);
-    R300_INIT_ATOM(rs_state, 0);
     R300_INIT_ATOM(scissor_state, 3);
+    /* All sorts of things. */
+    R300_INIT_ATOM(invariant_state, 22);
+    /* VAP. */
     R300_INIT_ATOM(viewport_state, 9);
-    R300_INIT_ATOM(rs_block_state, 0);
-    R300_INIT_ATOM(vertex_stream_state, 0);
     R300_INIT_ATOM(pvs_flush, 2);
+    R300_INIT_ATOM(vertex_stream_state, 0);
     R300_INIT_ATOM(vs_state, 0);
     R300_INIT_ATOM(vs_constants, 0);
-    R300_INIT_ATOM(texture_cache_inval, 2);
-    R300_INIT_ATOM(textures_state, 0);
+    R300_INIT_ATOM(clip_state, has_tcl ? 5 + (6 * 4) : 2);
+    /* VAP, RS, GA, GB. */
+    R300_INIT_ATOM(rs_block_state, 0);
+    R300_INIT_ATOM(rs_state, 0);
+    /* US. */
     R300_INIT_ATOM(fs, 0);
     R300_INIT_ATOM(fs_rc_constant_state, 0);
     R300_INIT_ATOM(fs_constants, 0);
+    /* TX. */
+    R300_INIT_ATOM(texture_cache_inval, 2);
+    R300_INIT_ATOM(textures_state, 0);
+    /* ZB (unpipelined), SU. */
+    R300_INIT_ATOM(query_start, 4);
 
     /* Replace emission functions for r500. */
     if (r300->screen->caps.is_r500) {
@@ -140,9 +158,11 @@
     }
 
     /* Some non-CSO atoms need explicit space to store the state locally. */
+    r300->aa_state.state = CALLOC_STRUCT(r300_aa_state);
     r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
-    r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
+    r300->clip_state.state = CALLOC_STRUCT(r300_clip_state);
     r300->fb_state.state = CALLOC_STRUCT(pipe_framebuffer_state);
+    r300->gpu_flush.state = CALLOC_STRUCT(r300_gpu_flush);
     r300->rs_block_state.state = CALLOC_STRUCT(r300_rs_block);
     r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
     r300->textures_state.state = CALLOC_STRUCT(r300_textures_state);
@@ -162,6 +182,52 @@
     r300->texture_cache_inval.allow_null_state = TRUE;
 }
 
+/* State trackers are not guaranteed to call every state-setting hook before
+ * the first draw, so fill the command buffers with defaults up front. */
+static void r300_init_states(struct pipe_context *pipe)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_gpu_flush *flush =
+            (struct r300_gpu_flush*)r300->gpu_flush.state;
+    struct r300_clip_state *clip_state =
+            (struct r300_clip_state*)r300->clip_state.state;
+    struct pipe_scissor_state scissor = {0};
+    struct pipe_clip_state clip_planes = {{{0}}};
+    struct pipe_blend_color blend_color = {{0}};
+    CB_LOCALS;
+
+    pipe->set_blend_color(pipe, &blend_color);
+    pipe->set_scissor_state(pipe, &scissor);
+
+    /* Clip state: go through the driver hook with TCL, otherwise disable. */
+    if (!r300->screen->caps.has_tcl) {
+        BEGIN_CB(clip_state->cb, 2);
+        OUT_CB_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE);
+        END_CB;
+    } else {
+        pipe->set_clip_state(pipe, &clip_planes);
+    }
+
+    /* Build the GPU flush command buffer. */
+    {
+        BEGIN_CB(flush->cb_flush_clean, 6);
+
+        /* Flush and free the RB3D destination cache and the ZB cache. */
+        OUT_CB_REG(R300_RB3D_DSTCACHE_CTLSTAT,
+            R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D |
+            R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS);
+        OUT_CB_REG(R300_ZB_ZCACHE_CTLSTAT,
+            R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE |
+            R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE);
+
+        /* Wait until the GPU is idle; without this, stray pixels
+         * sometimes show up, probably because rendering has not
+         * completed yet. */
+        OUT_CB_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+        END_CB;
+    }
+}
+
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
                                          void *priv)
 {
@@ -195,9 +261,6 @@
 
     r300_setup_atoms(r300);
 
-    /* Open up the OQ BO. */
-    r300->oqbo = pipe_buffer_create(screen,
-				    R300_BIND_OQBO, 4096);
     make_empty_list(&r300->query_list);
 
     r300_init_blit_functions(r300);
@@ -227,6 +290,39 @@
     if (r300->upload_vb == NULL)
         goto no_upload_vb;
 
+    r300->tran.translate_cache = translate_cache_create();
+
+    r300_init_states(&r300->context);
+
+    /* The KIL opcode needs the first texture unit to be enabled
+     * on r3xx-r4xx. In order to calm down the CS checker, we bind this
+     * dummy texture there. */
+    if (!r300->screen->caps.is_r500) {
+        struct pipe_resource *tex;
+        struct pipe_resource rtempl = {{0}};
+        struct pipe_sampler_view vtempl = {{0}};
+
+        rtempl.target = PIPE_TEXTURE_2D;
+        rtempl.format = PIPE_FORMAT_I8_UNORM;
+        rtempl.bind = PIPE_BIND_SAMPLER_VIEW;
+        rtempl.width0 = 1;
+        rtempl.height0 = 1;
+        rtempl.depth0 = 1;
+        tex = screen->resource_create(screen, &rtempl);
+
+        u_sampler_view_default_template(&vtempl, tex, tex->format);
+
+        r300->texkill_sampler = (struct r300_sampler_view*)
+            r300->context.create_sampler_view(&r300->context, tex, &vtempl);
+
+        pipe_resource_reference(&tex, NULL);
+
+        /* This will make sure that the dummy texture is set up
+         * from the beginning even if an application does not use
+         * textures. */
+        r300->textures_state.dirty = TRUE;
+    }
+
     return &r300->context;
 
  no_upload_ib:
@@ -238,10 +334,7 @@
 
 boolean r300_check_cs(struct r300_context *r300, unsigned size)
 {
-    struct r300_cs_info cs_info;
-
-    r300->rws->get_cs_info(r300->rws, &cs_info);
-    return size <= cs_info.free;
+    return size <= r300->rws->get_cs_free_dwords(r300->rws);
 }
 
 void r300_finish(struct r300_context *r300)
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index cca11f8..976ef20 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -31,6 +31,8 @@
 #include "util/u_inlines.h"
 #include "util/u_transfer.h"
 
+#include "translate/translate_cache.h"
+
 #include "r300_defines.h"
 #include "r300_screen.h"
 
@@ -59,36 +61,54 @@
     boolean allow_null_state;
 };
 
+struct r300_aa_state {
+    struct r300_surface *dest;
+
+    uint32_t aa_config;
+    uint32_t aaresolve_ctl;
+};
+
 struct r300_blend_state {
-    uint32_t blend_control;       /* R300_RB3D_CBLEND: 0x4e04 */
-    uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */
-    uint32_t color_channel_mask;  /* R300_RB3D_COLOR_CHANNEL_MASK: 0x4e0c */
-    uint32_t rop;                 /* R300_RB3D_ROPCNTL: 0x4e18 */
-    uint32_t dither;              /* R300_RB3D_DITHER_CTL: 0x4e50 */
+    uint32_t cb[8];
+    uint32_t cb_no_readwrite[8];
 };
 
 struct r300_blend_color_state {
-    /* RV515 and earlier */
-    uint32_t blend_color;            /* R300_RB3D_BLEND_COLOR: 0x4e10 */
-    /* R520 and newer */
-    uint32_t blend_color_red_alpha;  /* R500_RB3D_CONSTANT_COLOR_AR: 0x4ef8 */
-    uint32_t blend_color_green_blue; /* R500_RB3D_CONSTANT_COLOR_GB: 0x4efc */
+    uint32_t cb[3];
+};
+
+struct r300_clip_state {
+    struct pipe_clip_state clip;
+
+    uint32_t cb[29];
 };
 
 struct r300_dsa_state {
+    struct pipe_depth_stencil_alpha_state dsa;
+
+    /* This is actually a command buffer with named dwords. */
+    uint32_t cb_begin;
     uint32_t alpha_function;    /* R300_FG_ALPHA_FUNC: 0x4bd4 */
-    uint32_t alpha_reference;   /* R500_FG_ALPHA_VALUE: 0x4be0 */
+    uint32_t cb_reg_seq;
     uint32_t z_buffer_control;  /* R300_ZB_CNTL: 0x4f00 */
     uint32_t z_stencil_control; /* R300_ZB_ZSTENCILCNTL: 0x4f04 */
     uint32_t stencil_ref_mask;  /* R300_ZB_STENCILREFMASK: 0x4f08 */
+    uint32_t cb_reg;
     uint32_t stencil_ref_bf;    /* R500_ZB_STENCILREFMASK_BF: 0x4fd4 */
 
+    /* The second command buffer disables zbuffer reads and writes. */
+    uint32_t cb_no_readwrite[8];
+
     /* Whether a two-sided stencil is enabled. */
     boolean two_sided;
     /* Whether a fallback should be used for a two-sided stencil ref value. */
     boolean two_sided_stencil_ref;
 };
 
+struct r300_gpu_flush {
+    uint32_t cb_flush_clean[6]; /* RB3D + ZB cache flush, then WAIT_UNTIL. */
+};
+
 struct r300_rs_state {
     /* Original rasterizer state. */
     struct pipe_rasterizer_state rs;
@@ -96,7 +116,8 @@
     struct pipe_rasterizer_state rs_draw;
 
     uint32_t vap_control_status;    /* R300_VAP_CNTL_STATUS: 0x2140 */
-    uint32_t antialiasing_config;   /* R300_GB_AA_CONFIG: 0x4020 */
+    uint32_t multisample_position_0;/* R300_GB_MSPOS0: 0x4010 */
+    uint32_t multisample_position_1;/* R300_GB_MSPOS1: 0x4014 */
     uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */
     uint32_t point_minmax;          /* R300_GA_POINT_MINMAX: 0x4230 */
     uint32_t line_control;          /* R300_GA_LINE_CNTL: 0x4234 */
@@ -163,16 +184,14 @@
     /* Copy of r300_texture::texture_format_state with format-specific bits
      * added. */
     struct r300_texture_format_state format;
+
+    /* The texture cache region for this texture. */
+    uint32_t texcache_region;
 };
 
 struct r300_texture_fb_state {
-    /* Colorbuffer. */
-    uint32_t colorpitch[R300_MAX_TEXTURE_LEVELS]; /* R300_RB3D_COLORPITCH[0-3]*/
-    uint32_t us_out_fmt; /* R300_US_OUT_FMT[0-3] */
-
-    /* Zbuffer. */
-    uint32_t depthpitch[R300_MAX_TEXTURE_LEVELS]; /* R300_RB3D_DEPTHPITCH */
-    uint32_t zb_format; /* R300_ZB_FORMAT */
+    uint32_t pitch[R300_MAX_TEXTURE_LEVELS]; /* COLORPITCH or DEPTHPITCH. */
+    uint32_t format; /* US_OUT_FMT or R300_ZB_FORMAT */
 };
 
 struct r300_texture_sampler_state {
@@ -224,7 +243,7 @@
 
 struct r300_constant_buffer {
     /* Buffer of constants */
-    float constants[256][4];
+    uint32_t constants[256][4];
     /* Total number of constants */
     unsigned count;
 };
@@ -239,14 +258,23 @@
 struct r300_query {
     /* The kind of query. Currently only OQ is supported. */
     unsigned type;
-    /* The current count of this query. Required to be at least 32 bits. */
-    unsigned int count;
-    /* The offset of this query into the query buffer, in bytes. */
-    unsigned offset;
+    /* The number of pipes where query results are stored. */
+    unsigned num_pipes;
+    /* How many results have been written, in dwords. It's incremented
+     * after end_query and flush. */
+    unsigned num_results;
     /* if we've flushed the query */
     boolean flushed;
     /* if begin has been emitted */
     boolean begin_emitted;
+
+    /* The buffer where query results are stored. */
+    struct r300_winsys_buffer *buffer;
+    /* The size of the buffer. */
+    unsigned buffer_size;
+    /* The domain of the buffer. */
+    enum r300_buffer_domain domain;
+
     /* Linked list members. */
     struct r300_query* prev;
     struct r300_query* next;
@@ -268,6 +296,19 @@
     boolean signalled;
 };
 
+struct r300_surface {
+    struct pipe_surface base;
+
+    /* Winsys buffer backing the texture. */
+    struct r300_winsys_buffer *buffer;
+
+    enum r300_buffer_domain domain;
+
+    uint32_t offset;
+    uint32_t pitch;     /* COLORPITCH or DEPTHPITCH. */
+    uint32_t format;    /* US_OUT_FMT or R300_ZB_FORMAT. */
+};
+
 struct r300_texture {
     /* Parent class */
     struct u_resource b;
@@ -332,6 +373,9 @@
     enum pipe_format hw_format[PIPE_MAX_ATTRIBS];
     unsigned hw_format_size[PIPE_MAX_ATTRIBS];
 
+    /* The size of the vertex, in dwords. */
+    unsigned vertex_size_dwords;
+
     /* This might mean two things:
      * - src_format != hw_format, as discussed above.
      * - src_offset % 4 != 0. */
@@ -340,6 +384,17 @@
     struct r300_vertex_stream_state vertex_stream;
 };
 
+struct r300_translate_context {
+    /* Translate cache for incompatible vertex offset/stride/format fallback. */
+    struct translate_cache *translate_cache;
+
+    /* The vertex buffer slot containing the translated buffer. */
+    unsigned vb_slot;
+
+    /* Saved and new vertex element state. */
+    void *saved_velems, *new_velems;
+};
+
 struct r300_context {
     /* Parent class */
     struct pipe_context context;
@@ -354,21 +409,30 @@
     struct blitter_context* blitter;
     /* Stencil two-sided reference value fallback. */
     struct r300_stencilref_context *stencilref_fallback;
+    /* For translating vertex buffers having incompatible vertex layout. */
+    struct r300_translate_context tran;
 
     /* Vertex buffer for rendering. */
     struct pipe_resource* vbo;
+    /* The KIL opcode needs the first texture unit to be enabled
+     * on r3xx-r4xx. In order to calm down the CS checker, we bind this
+     * dummy texture there. */
+    struct r300_sampler_view *texkill_sampler;
     /* Offset into the VBO. */
     size_t vbo_offset;
 
-    /* Occlusion query buffer. */
-    struct pipe_resource* oqbo;
-    /* Query list. */
+    /* The currently active query. */
     struct r300_query *query_current;
+    /* The saved query for blitter operations. */
+    struct r300_query *blitter_saved_query;
+    /* Query list. */
     struct r300_query query_list;
 
     /* Various CSO state objects. */
     /* Beginning of atom list. */
     struct r300_atom atom_list;
+    /* Anti-aliasing (MSAA) state. */
+    struct r300_atom aa_state;
     /* Blend state. */
     struct r300_atom blend_state;
     /* Blend color state. */
@@ -409,6 +473,8 @@
     struct r300_atom pvs_flush;
     /* Texture cache invalidate. */
     struct r300_atom texture_cache_inval;
+    /* GPU flush. */
+    struct r300_atom gpu_flush;
 
     /* Invariant state. This must be emitted to get the engine started. */
     struct r300_atom invariant_state;
@@ -425,9 +491,6 @@
     struct vertex_info vertex_info;
 
     struct pipe_stencil_ref stencil_ref;
-
-    struct pipe_clip_state clip;
-
     struct pipe_viewport_state viewport;
 
     /* Stream locations for SWTCL. */
@@ -462,6 +525,11 @@
     return (struct r300_query*)q;
 }
 
+static INLINE struct r300_surface* r300_surface(struct pipe_surface* surf)
+{
+    return (struct r300_surface*)surf;
+}
+
 static INLINE struct r300_texture* r300_texture(struct pipe_resource* tex)
 {
     return (struct r300_texture*)tex;
@@ -480,6 +548,9 @@
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
                                          void *priv);
 
+boolean r300_check_cs(struct r300_context *r300, unsigned size);
+void r300_finish(struct r300_context *r300);
+
 /* Context initialization. */
 struct draw_stage* r300_draw_stage(struct r300_context* r300);
 void r300_init_blit_functions(struct r300_context *r300);
@@ -489,10 +560,29 @@
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_resource_functions(struct r300_context* r300);
 
-boolean r300_check_cs(struct r300_context *r300, unsigned size);
-void r300_finish(struct r300_context *r300);
+/* r300_query.c */
+void r300_resume_query(struct r300_context *r300,
+                       struct r300_query *query);
+void r300_stop_query(struct r300_context *r300);
+
+/* r300_render_translate.c */
+void r300_begin_vertex_translate(struct r300_context *r300);
+void r300_end_vertex_translate(struct r300_context *r300);
+void r300_translate_index_buffer(struct r300_context *r300,
+                                 struct pipe_resource **index_buffer,
+                                 unsigned *index_size, unsigned index_offset,
+                                 unsigned *start, unsigned count);
+
+/* r300_render_stencilref.c */
+void r300_plug_in_stencil_ref_fallback(struct r300_context *r300);
+
+/* r300_state.c */
+void r300_mark_fs_code_dirty(struct r300_context *r300);
+
+/* r300_debug.c */
 void r500_dump_rs_block(struct r300_rs_block *rs);
 
+
 static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
 {
     return SCREEN_DBG_ON(ctx->screen, flags);
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 9c8c273..1db7da6 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -20,171 +20,133 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+/**
+ * This file contains macros for immediate command submission.
+ */
+
 #ifndef R300_CS_H
 #define R300_CS_H
 
-#include "util/u_math.h"
-
 #include "r300_reg.h"
+#include "r300_context.h"
 #include "r300_winsys.h"
 
 /* Yes, I know macros are ugly. However, they are much prettier than the code
  * that they neatly hide away, and don't have the cost of function setup,so
  * we're going to use them. */
 
-#define MAX_CS_SIZE 64 * 1024 / 4
+#ifdef DEBUG
+#define CS_DEBUG(x) x   /* With DEBUG, the wrapped statement is kept. */
+#else
+#define CS_DEBUG(x)     /* Without DEBUG, it expands to nothing. */
+#endif /* DEBUG */
 
-#define VERY_VERBOSE_CS 1
-#define VERY_VERBOSE_REGISTERS 1
-
-/* XXX stolen from radeon_reg.h */
-#define RADEON_CP_PACKET0 0x0
-
-#define CP_PACKET0(register, count) \
-    (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
+/**
+ * Command submission setup.
+ */
 
 #define CS_LOCALS(context) \
     struct r300_context* const cs_context_copy = (context); \
     struct r300_winsys_screen *cs_winsys = cs_context_copy->rws; \
-    int cs_count = 0; (void) cs_count;
-
-#define CHECK_CS(size) \
-    assert(r300_check_cs(cs_context_copy, (size)))
+    CS_DEBUG(int cs_count = 0; (void) cs_count;)
 
 #define BEGIN_CS(size) do { \
-    CHECK_CS(size); \
-    if (VERY_VERBOSE_CS) { \
-        DBG(cs_context_copy, DBG_CS, "r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
-                size, __FUNCTION__, __FILE__, __LINE__); \
-    } \
-    cs_winsys->begin_cs(cs_winsys, (size), \
-            __FILE__, __FUNCTION__, __LINE__); \
-    cs_count = size; \
+    assert(r300_check_cs(cs_context_copy, (size))); \
+    CS_DEBUG(cs_count = size;) \
 } while (0)
 
+#ifdef DEBUG
+#define END_CS do { /* Warn when the dword count set by BEGIN_CS is off. */ \
+    if (cs_count != 0) \
+        debug_printf("r300: Warning: cs_count off by %d at (%s, %s:%i)\n", \
+                     cs_count, __FUNCTION__, __FILE__, __LINE__); \
+    cs_count = 0; \
+} while (0)
+#else /* !DEBUG */
+#define END_CS /* Nothing to check: cs_count only exists with DEBUG. */
+#endif /* DEBUG */
+
+/**
+ * Writing pure DWORDs.
+ */
+
 #define OUT_CS(value) do { \
-    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
-        DBG(cs_context_copy, DBG_CS, "r300: writing %08x\n", value); \
-    } \
     cs_winsys->write_cs_dword(cs_winsys, (value)); \
-    cs_count--; \
+    CS_DEBUG(cs_count--;) \
 } while (0)
 
 #define OUT_CS_32F(value) do { \
-    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
-        DBG(cs_context_copy, DBG_CS, "r300: writing %f\n", value); \
-    } \
     cs_winsys->write_cs_dword(cs_winsys, fui(value)); \
-    cs_count--; \
+    CS_DEBUG(cs_count--;) \
 } while (0)
 
 #define OUT_CS_REG(register, value) do { \
-    if (VERY_VERBOSE_REGISTERS) \
-        DBG(cs_context_copy, DBG_CS, "r300: writing 0x%08X to register 0x%04X\n", \
-            value, register); \
     assert(register); \
     cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0(register, 0)); \
     cs_winsys->write_cs_dword(cs_winsys, value); \
-    cs_count -= 2; \
+    CS_DEBUG(cs_count -= 2;) \
 } while (0)
 
 /* Note: This expects count to be the number of registers,
  * not the actual packet0 count! */
 #define OUT_CS_REG_SEQ(register, count) do { \
-    if (VERY_VERBOSE_REGISTERS) \
-        DBG(cs_context_copy, DBG_CS, "r300: writing register sequence of %d to 0x%04X\n", \
-            count, register); \
     assert(register); \
     cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1))); \
-    cs_count--; \
+    CS_DEBUG(cs_count--;) \
 } while (0)
 
 #define OUT_CS_TABLE(values, count) do { \
-    if (VERY_VERBOSE_REGISTERS) \
-        DBG(cs_context_copy, DBG_CS, "r300: writing table of %d dwords\n", count); \
     cs_winsys->write_cs_table(cs_winsys, values, count); \
-    cs_count -= count; \
+    CS_DEBUG(cs_count -= count;) \
 } while (0)
 
-#define OUT_CS_BUF_RELOC(bo, offset, rd, wd, flags) do { \
-    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, offset %d, " \
-            "domains (%d, %d, %d)\n", \
-        bo, offset, rd, wd, flags); \
-    assert(bo); \
-    cs_winsys->write_cs_dword(cs_winsys, offset); \
-    r300_buffer_write_reloc(cs_winsys, r300_buffer(bo), rd, wd, flags);	\
-    cs_count -= 3; \
-} while (0)
-
-
-#define OUT_CS_TEX_RELOC(tex, offset, rd, wd, flags) do { \
-    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for texture %p, offset %d, " \
-            "domains (%d, %d, %d)\n", \
-        tex, offset, rd, wd, flags); \
-    assert(tex); \
-    cs_winsys->write_cs_dword(cs_winsys, offset); \
-    r300_texture_write_reloc(cs_winsys, tex, rd, wd, flags);	\
-    cs_count -= 3; \
-} while (0)
-
-
-#define OUT_CS_BUF_RELOC_NO_OFFSET(bo, rd, wd, flags) do { \
-    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, " \
-            "domains (%d, %d, %d)\n", \
-        bo, rd, wd, flags); \
-    assert(bo); \
-    r300_buffer_write_reloc(cs_winsys, r300_buffer(bo), rd, wd, flags);	\
-    cs_count -= 2; \
-} while (0)
-
-#define END_CS do { \
-    if (VERY_VERBOSE_CS) { \
-        DBG(cs_context_copy, DBG_CS, "r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
-                __FILE__, __LINE__); \
-    } \
-    if (cs_count != 0) \
-        debug_printf("r300: Warning: cs_count off by %d\n", cs_count); \
-    cs_winsys->end_cs(cs_winsys, __FILE__, __FUNCTION__, __LINE__); \
-} while (0)
-
-#define FLUSH_CS do { \
-    if (VERY_VERBOSE_CS) { \
-        DBG(cs_context_copy, DBG_CS, "r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
-                __FILE__, __LINE__); \
-    } \
-    if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) { \
-        r300->flush_counter++; \
-    } \
-    cs_winsys->flush_cs(cs_winsys); \
-} while (0)
-
-#define RADEON_ONE_REG_WR        (1 << 15)
-
 #define OUT_CS_ONE_REG(register, count) do { \
-    if (VERY_VERBOSE_REGISTERS) \
-        DBG(cs_context_copy, DBG_CS, "r300: writing data sequence of %d to 0x%04X\n", \
-            count, register); \
     assert(register); \
     cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1)) | RADEON_ONE_REG_WR); \
-    cs_count--; \
+    CS_DEBUG(cs_count--;) \
 } while (0)
 
-#define CP_PACKET3(op, count) \
-    (RADEON_CP_PACKET3 | (op) | ((count) << 16))
-
 #define OUT_CS_PKT3(op, count) do { \
     cs_winsys->write_cs_dword(cs_winsys, CP_PACKET3(op, count)); \
-    cs_count--; \
+    CS_DEBUG(cs_count--;) \
 } while (0)
 
-#define OUT_CS_INDEX_RELOC(bo, offset, count, rd, wd, flags) do { \
-    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for index buffer %p," \
-            "offset %d\n", bo, offset); \
+
+/**
+ * Writing relocations.
+ */
+
+#define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
     assert(bo); \
     cs_winsys->write_cs_dword(cs_winsys, offset); \
-    cs_winsys->write_cs_dword(cs_winsys, count); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 4; \
+    CS_DEBUG(cs_count -= 3;) \
+} while (0)
+
+#define OUT_CS_BUF_RELOC(bo, offset, rd, wd, flags) do { \
+    assert(bo); \
+    OUT_CS_RELOC(r300_buffer(bo)->buf, offset, rd, wd, flags); \
+} while (0)
+
+#define OUT_CS_TEX_RELOC(tex, offset, rd, wd, flags) do { \
+    assert(tex); \
+    OUT_CS_RELOC((tex)->buffer, offset, rd, wd, flags); \
+} while (0)
+
+#define OUT_CS_BUF_RELOC_NO_OFFSET(bo, rd, wd, flags) do { \
+    assert(bo); \
+    cs_winsys->write_cs_reloc(cs_winsys, r300_buffer(bo)->buf, rd, wd, flags); \
+    CS_DEBUG(cs_count -= 2;) \
+} while (0)
+
+
+/**
+ * Command buffer emission.
+ */
+
+#define WRITE_CS_TABLE(values, count) do { \
+    CS_DEBUG(assert(cs_count == 0);) \
+    cs_winsys->write_cs_table(cs_winsys, values, count); \
 } while (0)
 
 #endif /* R300_CS_H */
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
index 297791f..a6cd86e 100644
--- a/src/gallium/drivers/r300/r300_debug.c
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -29,7 +29,6 @@
 static const struct debug_named_value debug_options[] = {
     { "fp", DBG_FP, "Fragment program handling (for debugging)" },
     { "vp", DBG_VP, "Vertex program handling (for debugging)" },
-    { "cs", DBG_CS, "Command submissions (for debugging)" },
     { "draw", DBG_DRAW, "Draw and emit (for debugging)" },
     { "tex", DBG_TEX, "Textures (for debugging)" },
     { "texalloc", DBG_TEXALLOC, "Texture allocation (for debugging)" },
@@ -39,6 +38,7 @@
     { "anisohq", DBG_ANISOHQ, "High quality anisotropic filtering (for benchmarking)" },
     { "notiling", DBG_NO_TILING, "Disable tiling (for benchmarking)" },
     { "noimmd", DBG_NO_IMMD, "Disable immediate mode (for benchmarking)" },
+    { "fakeocc", DBG_FAKE_OCC, "Use fake occlusion queries (for lulz)" },
     { "stats", DBG_STATS, "Gather statistics (for lulz)" },
 
     /* must be last */
diff --git a/src/gallium/drivers/r300/r300_defines.h b/src/gallium/drivers/r300/r300_defines.h
index 565a2f3..d510d80 100644
--- a/src/gallium/drivers/r300/r300_defines.h
+++ b/src/gallium/drivers/r300/r300_defines.h
@@ -32,12 +32,6 @@
 
 #define R300_INVALID_FORMAT 0xffff
 
-/* XXX: this is just a bandaid on larger problems in
- * r300_screen_buffer.h which doesn't seem to be fully ported to
- * gallium-resources.
- */
-#define R300_BIND_OQBO  (1<<21)
-
 /* Tiling flags. */
 enum r300_buffer_tiling {
     R300_BUFFER_LINEAR = 0,
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index dd9bca8..16cb168 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -43,21 +43,11 @@
         (struct pipe_framebuffer_state*)r300->fb_state.state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(size);
-    OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
-    OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
     if (fb->nr_cbufs) {
-        OUT_CS(blend->blend_control);
-        OUT_CS(blend->alpha_blend_control);
-        OUT_CS(blend->color_channel_mask);
+        WRITE_CS_TABLE(blend->cb, size);
     } else {
-        OUT_CS(0);
-        OUT_CS(0);
-        OUT_CS(0);
-        /* XXX also disable fastfill here once it's supported */
+        WRITE_CS_TABLE(blend->cb_no_readwrite, size);
     }
-    OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither);
-    END_CS;
 }
 
 void r300_emit_blend_color_state(struct r300_context* r300,
@@ -66,40 +56,16 @@
     struct r300_blend_color_state* bc = (struct r300_blend_color_state*)state;
     CS_LOCALS(r300);
 
-    if (r300->screen->caps.is_r500) {
-        BEGIN_CS(size);
-        OUT_CS_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2);
-        OUT_CS(bc->blend_color_red_alpha);
-        OUT_CS(bc->blend_color_green_blue);
-        END_CS;
-    } else {
-        BEGIN_CS(size);
-        OUT_CS_REG(R300_RB3D_BLEND_COLOR, bc->blend_color);
-        END_CS;
-    }
+    WRITE_CS_TABLE(bc->cb, size);
 }
 
 void r300_emit_clip_state(struct r300_context* r300,
                           unsigned size, void* state)
 {
-    struct pipe_clip_state* clip = (struct pipe_clip_state*)state;
+    struct r300_clip_state* clip = (struct r300_clip_state*)state;
     CS_LOCALS(r300);
 
-    if (r300->screen->caps.has_tcl) {
-        BEGIN_CS(size);
-        OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
-                (r300->screen->caps.is_r500 ?
-                 R500_PVS_UCP_START : R300_PVS_UCP_START));
-        OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, 6 * 4);
-        OUT_CS_TABLE(clip->ucp, 6 * 4);
-        OUT_CS_REG(R300_VAP_CLIP_CNTL, ((1 << clip->nr) - 1) |
-                R300_PS_UCP_MODE_CLIP_AS_TRIFAN);
-        END_CS;
-    } else {
-        BEGIN_CS(size);
-        OUT_CS_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE);
-        END_CS;
-    }
+    WRITE_CS_TABLE(clip->cb, size);
 }
 
 void r300_emit_dsa_state(struct r300_context* r300, unsigned size, void* state)
@@ -107,27 +73,13 @@
     struct r300_dsa_state* dsa = (struct r300_dsa_state*)state;
     struct pipe_framebuffer_state* fb =
         (struct pipe_framebuffer_state*)r300->fb_state.state;
-    struct pipe_stencil_ref stencil_ref = r300->stencil_ref;
     CS_LOCALS(r300);
 
-    BEGIN_CS(size);
-    OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
-    OUT_CS_REG_SEQ(R300_ZB_CNTL, 3);
-
     if (fb->zsbuf) {
-        OUT_CS(dsa->z_buffer_control);
-        OUT_CS(dsa->z_stencil_control);
+        WRITE_CS_TABLE(&dsa->cb_begin, size);
     } else {
-        OUT_CS(0);
-        OUT_CS(0);
+        WRITE_CS_TABLE(dsa->cb_no_readwrite, size);
     }
-
-    OUT_CS(dsa->stencil_ref_mask | stencil_ref.ref_value[0]);
-
-    if (r300->screen->caps.is_r500) {
-        OUT_CS_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf | stencil_ref.ref_value[1]);
-    }
-    END_CS;
 }
 
 static const float * get_rc_constant_state(
@@ -175,7 +127,7 @@
 /* Convert a normal single-precision float into the 7.16 format
  * used by the R300 fragment shader.
  */
-static uint32_t pack_float24(float f)
+uint32_t pack_float24(float f)
 {
     union {
         float fl;
@@ -206,101 +158,37 @@
     return float24;
 }
 
-unsigned r300_get_fs_atom_size(struct r300_context *r300)
-{
-    struct r300_fragment_shader *fs = r300_fs(r300);
-    unsigned imm_count = fs->shader->immediates_count;
-    struct r300_fragment_program_code *code = &fs->shader->code.code.r300;
-
-    return 19 +
-           code->alu.length * 4 +
-           (code->tex.length ? (1 + code->tex.length) : 0) +
-           (imm_count ? imm_count * 5 : 0);
-}
-
+/* Emit the R300 fragment shader by handing its prebuilt command buffer
+ * (fs->shader->cb_code, cb_code_size entries) to WRITE_CS_TABLE; the
+ * length comes from cb_code_size, not from the "size" argument. */
 void r300_emit_fs(struct r300_context* r300, unsigned size, void *state)
 {
     struct r300_fragment_shader *fs = r300_fs(r300);
-    struct rX00_fragment_program_code* generic_code = &fs->shader->code;
-    struct r300_fragment_program_code * code = &generic_code->code.r300;
-    unsigned i;
-    unsigned imm_count = fs->shader->immediates_count;
-    unsigned imm_first = fs->shader->externals_count;
-    unsigned imm_end = generic_code->constants.Count;
-    struct rc_constant *constants = generic_code->constants.Constants;
     CS_LOCALS(r300);
 
-    BEGIN_CS(size);
-    OUT_CS_REG(R300_US_CONFIG, code->config);
-    OUT_CS_REG(R300_US_PIXSIZE, code->pixsize);
-    OUT_CS_REG(R300_US_CODE_OFFSET, code->code_offset);
-
-    OUT_CS_REG_SEQ(R300_US_CODE_ADDR_0, 4);
-    OUT_CS_TABLE(code->code_addr, 4);
-
-    OUT_CS_REG_SEQ(R300_US_ALU_RGB_INST_0, code->alu.length);
-    for (i = 0; i < code->alu.length; i++)
-        OUT_CS(code->alu.inst[i].rgb_inst);
-
-    OUT_CS_REG_SEQ(R300_US_ALU_RGB_ADDR_0, code->alu.length);
-    for (i = 0; i < code->alu.length; i++)
-        OUT_CS(code->alu.inst[i].rgb_addr);
-
-    OUT_CS_REG_SEQ(R300_US_ALU_ALPHA_INST_0, code->alu.length);
-    for (i = 0; i < code->alu.length; i++)
-        OUT_CS(code->alu.inst[i].alpha_inst);
-
-    OUT_CS_REG_SEQ(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
-    for (i = 0; i < code->alu.length; i++)
-        OUT_CS(code->alu.inst[i].alpha_addr);
-
-    if (code->tex.length) {
-        OUT_CS_REG_SEQ(R300_US_TEX_INST_0, code->tex.length);
-        OUT_CS_TABLE(code->tex.inst, code->tex.length);
-    }
-
-    /* Emit immediates. */
-    if (imm_count) {
-        for(i = imm_first; i < imm_end; ++i) {
-            if (constants[i].Type == RC_CONSTANT_IMMEDIATE) {
-                const float *data = constants[i].u.Immediate;
-
-                OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X + i * 16, 4);
-                OUT_CS(pack_float24(data[0]));
-                OUT_CS(pack_float24(data[1]));
-                OUT_CS(pack_float24(data[2]));
-                OUT_CS(pack_float24(data[3]));
-            }
-        }
-    }
-
-    OUT_CS_REG(R300_FG_DEPTH_SRC, fs->shader->fg_depth_src);
-    OUT_CS_REG(R300_US_W_FMT, fs->shader->us_out_w);
-    END_CS;
+    WRITE_CS_TABLE(fs->shader->cb_code, fs->shader->cb_code_size);
 }
 
+/* Upload the fragment shader's external constants as one register
+ * sequence starting at R300_PFS_PARAM_0_X. "count" is in dwords, four per
+ * constant, and buf->constants is written out as-is.
+ *
+ * NOTE(review): the loop removed here ran every component through
+ * pack_float24(); presumably buf->constants now already holds packed
+ * values -- confirm against the code that fills the constant buffer. */
 void r300_emit_fs_constants(struct r300_context* r300, unsigned size, void *state)
 {
     struct r300_fragment_shader *fs = r300_fs(r300);
-    struct rc_constant_list *constants = &fs->shader->code.constants;
     struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state;
-    unsigned i, count = fs->shader->externals_count;
+    unsigned count = fs->shader->externals_count * 4;
     CS_LOCALS(r300);
 
     if (count == 0)
         return;
 
     BEGIN_CS(size);
-    OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X, count * 4);
-    for(i = 0; i < count; ++i) {
-        const float *data;
-        assert(constants->Constants[i].Type == RC_CONSTANT_EXTERNAL);
-        data = buf->constants[i];
-        OUT_CS(pack_float24(data[0]));
-        OUT_CS(pack_float24(data[1]));
-        OUT_CS(pack_float24(data[2]));
-        OUT_CS(pack_float24(data[3]));
-    }
+    OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X, count);
+    OUT_CS_TABLE(buf->constants, count);
     END_CS;
 }
 
@@ -312,6 +190,8 @@
     unsigned count = fs->shader->rc_state_count;
     unsigned first = fs->shader->externals_count;
     unsigned end = constants->Count;
+    uint32_t cdata[4];
+    unsigned j;
     CS_LOCALS(r300);
 
     if (count == 0)
@@ -323,85 +203,33 @@
             const float *data =
                     get_rc_constant_state(r300, &constants->Constants[i]);
 
+            /* Pack the four components with pack_float24() first. */
+            for (j = 0; j < 4; j++)
+                cdata[j] = pack_float24(data[j]);
+
             OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X + i * 16, 4);
-            OUT_CS(pack_float24(data[0]));
-            OUT_CS(pack_float24(data[1]));
-            OUT_CS(pack_float24(data[2]));
-            OUT_CS(pack_float24(data[3]));
+            OUT_CS_TABLE(cdata, 4);
         }
     }
     END_CS;
 }
 
-unsigned r500_get_fs_atom_size(struct r300_context *r300)
-{
-    struct r300_fragment_shader *fs = r300_fs(r300);
-    unsigned imm_count = fs->shader->immediates_count;
-    struct r500_fragment_program_code *code = &fs->shader->code.code.r500;
-
-    return 17 +
-           ((code->inst_end + 1) * 6) +
-           (imm_count ? imm_count * 7 : 0);
-}
-
+/* Emit the R500 fragment shader by handing its prebuilt command buffer
+ * (fs->shader->cb_code, cb_code_size entries) to WRITE_CS_TABLE; the
+ * length comes from cb_code_size, not from the "size" argument. */
 void r500_emit_fs(struct r300_context* r300, unsigned size, void *state)
 {
     struct r300_fragment_shader *fs = r300_fs(r300);
-    struct rX00_fragment_program_code* generic_code = &fs->shader->code;
-    struct r500_fragment_program_code * code = &generic_code->code.r500;
-    unsigned i;
-    unsigned imm_count = fs->shader->immediates_count;
-    unsigned imm_first = fs->shader->externals_count;
-    unsigned imm_end = generic_code->constants.Count;
-    struct rc_constant *constants = generic_code->constants.Constants;
     CS_LOCALS(r300);
 
-    BEGIN_CS(size);
-    OUT_CS_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
-    OUT_CS_REG(R500_US_PIXSIZE, code->max_temp_idx);
-    OUT_CS_REG(R500_US_CODE_RANGE,
-               R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
-    OUT_CS_REG(R500_US_CODE_OFFSET, 0);
-    OUT_CS_REG(R500_US_CODE_ADDR,
-               R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(code->inst_end));
-
-    OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_INSTR);
-    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, (code->inst_end + 1) * 6);
-    for (i = 0; i <= code->inst_end; i++) {
-        OUT_CS(code->inst[i].inst0);
-        OUT_CS(code->inst[i].inst1);
-        OUT_CS(code->inst[i].inst2);
-        OUT_CS(code->inst[i].inst3);
-        OUT_CS(code->inst[i].inst4);
-        OUT_CS(code->inst[i].inst5);
-    }
-
-    /* Emit immediates. */
-    if (imm_count) {
-        for(i = imm_first; i < imm_end; ++i) {
-            if (constants[i].Type == RC_CONSTANT_IMMEDIATE) {
-                const float *data = constants[i].u.Immediate;
-
-                OUT_CS_REG(R500_GA_US_VECTOR_INDEX,
-                           R500_GA_US_VECTOR_INDEX_TYPE_CONST |
-                           (i & R500_GA_US_VECTOR_INDEX_MASK));
-                OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, 4);
-                OUT_CS_TABLE(data, 4);
-            }
-        }
-    }
-
-    OUT_CS_REG(R300_FG_DEPTH_SRC, fs->shader->fg_depth_src);
-    OUT_CS_REG(R300_US_W_FMT, fs->shader->us_out_w);
-    END_CS;
+    WRITE_CS_TABLE(fs->shader->cb_code, fs->shader->cb_code_size);
 }
 
 void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *state)
 {
     struct r300_fragment_shader *fs = r300_fs(r300);
-    struct rc_constant_list *constants = &fs->shader->code.constants;
     struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state;
-    unsigned i, count = fs->shader->externals_count;
+    unsigned count = fs->shader->externals_count * 4;
     CS_LOCALS(r300);
 
     if (count == 0)
@@ -409,11 +233,8 @@
 
     BEGIN_CS(size);
     OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST);
-    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count * 4);
-    for(i = 0; i < count; ++i) {
-        assert(constants->Constants[i].Type == RC_CONSTANT_EXTERNAL);
-    }
-    OUT_CS_TABLE(buf->constants, count * 4);
+    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count);
+    OUT_CS_TABLE(buf->constants, count);
     END_CS;
 }
 
@@ -446,73 +267,17 @@
     END_CS;
 }
 
-void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
+void r300_emit_gpu_flush(struct r300_context *r300, unsigned size, void *state)
 {
-    struct pipe_framebuffer_state* fb = (struct pipe_framebuffer_state*)state;
-    struct r300_texture* tex;
-    struct pipe_surface* surf;
-    int i;
+    struct r300_gpu_flush *gpuflush = (struct r300_gpu_flush*)state;
+    struct pipe_framebuffer_state* fb =
+            (struct pipe_framebuffer_state*)r300->fb_state.state;
     CS_LOCALS(r300);
 
     BEGIN_CS(size);
 
-    /* Flush and free renderbuffer caches. */
-    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
-        R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
-        R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
-    OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT,
-        R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-        R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
-
-    /* Set the number of colorbuffers. */
-    if (fb->nr_cbufs > 1) {
-        if (r300->screen->caps.is_r500) {
-            OUT_CS_REG(R300_RB3D_CCTL,
-                R300_RB3D_CCTL_NUM_MULTIWRITES(fb->nr_cbufs) |
-                R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE);
-        } else {
-            OUT_CS_REG(R300_RB3D_CCTL,
-                R300_RB3D_CCTL_NUM_MULTIWRITES(fb->nr_cbufs));
-        }
-    } else {
-        OUT_CS_REG(R300_RB3D_CCTL, 0x0);
-    }
-
-    /* Set up colorbuffers. */
-    for (i = 0; i < fb->nr_cbufs; i++) {
-        surf = fb->cbufs[i];
-        tex = r300_texture(surf->texture);
-        assert(tex && tex->buffer && "cbuf is marked, but NULL!");
-
-        OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
-        OUT_CS_TEX_RELOC(tex, surf->offset, 0, tex->domain, 0);
-
-        OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
-        OUT_CS_TEX_RELOC(tex, tex->fb_state.colorpitch[surf->level],
-                     0, tex->domain, 0);
-
-        OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), tex->fb_state.us_out_fmt);
-    }
-    for (; i < 4; i++) {
-        OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), R300_US_OUT_FMT_UNUSED);
-    }
-
-    /* Set up a zbuffer. */
-    if (fb->zsbuf) {
-        surf = fb->zsbuf;
-        tex = r300_texture(surf->texture);
-        assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
-
-        OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
-        OUT_CS_TEX_RELOC(tex, surf->offset, 0, tex->domain, 0);
-
-        OUT_CS_REG(R300_ZB_FORMAT, tex->fb_state.zb_format);
-
-        OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
-        OUT_CS_TEX_RELOC(tex, tex->fb_state.depthpitch[surf->level],
-                     0, tex->domain, 0);
-    }
-
+    /* Set up scissors.
+     * By writing to the SC registers, SC & US assert idle. */
     OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
     if (r300->screen->caps.is_r500) {
         OUT_CS(0);
@@ -524,6 +289,108 @@
         OUT_CS(((fb->width  + 1440-1) << R300_SCISSORS_X_SHIFT) |
                ((fb->height + 1440-1) << R300_SCISSORS_Y_SHIFT));
     }
+
+    /* Flush CB & ZB caches and wait until the 3D engine is idle and clean. */
+    OUT_CS_TABLE(gpuflush->cb_flush_clean, 6);
+    END_CS;
+}
+
+/* Emit the anti-aliasing state: GB_AA_CONFIG, then -- only when a resolve
+ * destination (aa->dest) is set -- the AA resolve offset and pitch, each
+ * followed by a relocation against that surface's buffer, and finally
+ * RB3D_AARESOLVE_CTL.
+ *
+ * NOTE(review): "size" is passed straight to BEGIN_CS although the number
+ * of dwords written depends on aa->dest; presumably the caller accounts
+ * for that -- confirm. */
+void r300_emit_aa_state(struct r300_context *r300, unsigned size, void *state)
+{
+    struct r300_aa_state *aa = (struct r300_aa_state*)state;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_GB_AA_CONFIG, aa->aa_config);
+
+    if (aa->dest) {
+        OUT_CS_REG_SEQ(R300_RB3D_AARESOLVE_OFFSET, 1);
+        OUT_CS_RELOC(aa->dest->buffer, aa->dest->offset, 0, aa->dest->domain, 0);
+
+        OUT_CS_REG_SEQ(R300_RB3D_AARESOLVE_PITCH, 1);
+        OUT_CS_RELOC(aa->dest->buffer, aa->dest->pitch, 0, aa->dest->domain, 0);
+    }
+
+    OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, aa->aaresolve_ctl);
+    END_CS;
+}
+
+/* Emit the framebuffer state: RB3D_CCTL, the offset and pitch of every
+ * bound colorbuffer, the depth/stencil buffer registers when fb->zsbuf is
+ * set, and finally the four US_OUT_FMT registers. */
+void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
+{
+    struct pipe_framebuffer_state* fb = (struct pipe_framebuffer_state*)state;
+    struct r300_surface* surf;
+    unsigned i;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+
+    /* NUM_MULTIWRITES replicates COLOR[0] to all colorbuffers, which is not
+     * what we usually want. */
+    if (r300->screen->caps.is_r500) {
+        OUT_CS_REG(R300_RB3D_CCTL,
+            R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE);
+    } else {
+        OUT_CS_REG(R300_RB3D_CCTL, 0);
+    }
+
+    /* Set up colorbuffers. */
+    for (i = 0; i < fb->nr_cbufs; i++) {
+        surf = r300_surface(fb->cbufs[i]);
+
+        OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
+        OUT_CS_RELOC(surf->buffer, surf->offset, 0, surf->domain, 0);
+
+        OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
+        OUT_CS_RELOC(surf->buffer, surf->pitch, 0, surf->domain, 0);
+    }
+
+    /* Set up a zbuffer. */
+    if (fb->zsbuf) {
+        surf = r300_surface(fb->zsbuf);
+
+        OUT_CS_REG(R300_ZB_FORMAT, surf->format);
+        OUT_CS_REG(R300_ZB_BW_CNTL, 0);
+
+        OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
+        OUT_CS_RELOC(surf->buffer, surf->offset, 0, surf->domain, 0);
+
+        OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
+        OUT_CS_RELOC(surf->buffer, surf->pitch, 0, surf->domain, 0);
+
+        OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0);
+
+        /* HiZ RAM. */
+        if (r300->screen->caps.has_hiz) {
+            OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0);
+            OUT_CS_REG(R300_ZB_HIZ_PITCH, 0);
+        }
+
+        /* Z Mask RAM. (compressed zbuffer) */
+        OUT_CS_REG(R300_ZB_ZMASK_OFFSET, 0);
+        OUT_CS_REG(R300_ZB_ZMASK_PITCH, 0);
+    }
+
+    /* Colorbuffer format in the US block.
+     * (must be written after unpipelined regs) */
+    OUT_CS_REG_SEQ(R300_US_OUT_FMT_0, 4);
+    for (i = 0; i < fb->nr_cbufs; i++) {
+        OUT_CS(r300_surface(fb->cbufs[i])->format);
+    }
+    /* Slots without a colorbuffer, up to the fourth, are marked unused. */
+    for (; i < 4; i++) {
+        OUT_CS(R300_US_OUT_FMT_UNUSED);
+    }
     END_CS;
 }
 
@@ -544,13 +399,14 @@
     OUT_CS_REG(R300_ZB_ZPASS_DATA, 0);
     END_CS;
     query->begin_emitted = TRUE;
+    query->flushed = FALSE;
 }
 
-
 static void r300_emit_query_end_frag_pipes(struct r300_context *r300,
                                            struct r300_query *query)
 {
     struct r300_capabilities* caps = &r300->screen->caps;
+    struct r300_winsys_buffer *buf = r300->query_current->buffer;
     CS_LOCALS(r300);
 
     assert(caps->num_frag_pipes);
@@ -569,28 +425,28 @@
             /* pipe 3 only */
             OUT_CS_REG(R300_SU_REG_DEST, 1 << 3);
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-            OUT_CS_BUF_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 3),
-                    0, r300_buffer(r300->oqbo)->domain, 0);
+            OUT_CS_RELOC(buf, (query->num_results + 3) * 4,
+                    0, query->domain, 0);
         case 3:
             /* pipe 2 only */
             OUT_CS_REG(R300_SU_REG_DEST, 1 << 2);
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-            OUT_CS_BUF_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 2),
-                    0, r300_buffer(r300->oqbo)->domain, 0);
+            OUT_CS_RELOC(buf, (query->num_results + 2) * 4,
+                    0, query->domain, 0);
         case 2:
             /* pipe 1 only */
             /* As mentioned above, accomodate RV380 and older. */
             OUT_CS_REG(R300_SU_REG_DEST,
                     1 << (caps->high_second_pipe ? 3 : 1));
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-            OUT_CS_BUF_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 1),
-                    0, r300_buffer(r300->oqbo)->domain, 0);
+            OUT_CS_RELOC(buf, (query->num_results + 1) * 4,
+                    0, query->domain, 0);
         case 1:
             /* pipe 0 only */
             OUT_CS_REG(R300_SU_REG_DEST, 1 << 0);
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-            OUT_CS_BUF_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 0),
-                    0, r300_buffer(r300->oqbo)->domain, 0);
+            OUT_CS_RELOC(buf, (query->num_results + 0) * 4,
+                    0, query->domain, 0);
             break;
         default:
             fprintf(stderr, "r300: Implementation error: Chipset reports %d"
@@ -606,12 +462,18 @@
 static void rv530_emit_query_end_single_z(struct r300_context *r300,
                                           struct r300_query *query)
 {
+    /* R300_ZB_ZPASS_ADDR is relocated into the current query's buffer at
+     * byte offset query->num_results * 4.
+     * NOTE(review): buf is taken from r300->query_current while the offset
+     * and domain come from the "query" argument -- presumably these are
+     * the same query; confirm. */
+    struct r300_winsys_buffer *buf = r300->query_current->buffer;
     CS_LOCALS(r300);
 
     BEGIN_CS(8);
     OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
     OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-    OUT_CS_BUF_RELOC(r300->oqbo, query->offset, 0, r300_buffer(r300->oqbo)->domain, 0);
+    OUT_CS_RELOC(buf, query->num_results * 4, 0, query->domain, 0);
     OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
     END_CS;
 }
@@ -619,15 +476,21 @@
 static void rv530_emit_query_end_double_z(struct r300_context *r300,
                                           struct r300_query *query)
 {
+    /* One slot per Z pipe in the current query's buffer: pipe 0 at dword
+     * index query->num_results and pipe 1 at the following dword (the
+     * relocation offsets below are those indices times 4).
+     * NOTE(review): buf is taken from r300->query_current rather than from
+     * the "query" argument -- presumably the same query; confirm. */
+    struct r300_winsys_buffer *buf = r300->query_current->buffer;
     CS_LOCALS(r300);
 
     BEGIN_CS(14);
     OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
     OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-    OUT_CS_BUF_RELOC(r300->oqbo, query->offset, 0, r300_buffer(r300->oqbo)->domain, 0);
+    OUT_CS_RELOC(buf, (query->num_results + 0) * 4, 0, query->domain, 0);
     OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_1);
     OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
-    OUT_CS_BUF_RELOC(r300->oqbo, query->offset + sizeof(uint32_t), 0, r300_buffer(r300->oqbo)->domain, 0);
+    OUT_CS_RELOC(buf, (query->num_results + 1) * 4, 0, query->domain, 0);
     OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
     END_CS;
 }
@@ -652,18 +510,63 @@
         r300_emit_query_end_frag_pipes(r300, query);
 
     query->begin_emitted = FALSE;
+    query->num_results += query->num_pipes;
+
+    /* XXX grab all the results and reset the counter. */
+    if (query->num_results >= query->buffer_size / 4 - 4) {
+        query->num_results = (query->buffer_size / 4) / 2;
+        fprintf(stderr, "r300: Rewinding OQBO...\n");
+    }
 }
 
 void r300_emit_rs_state(struct r300_context* r300, unsigned size, void* state)
 {
-    struct r300_rs_state* rs = (struct r300_rs_state*)state;
+    struct r300_rs_state* rs = state;
+    struct pipe_framebuffer_state* fb = r300->fb_state.state;
     float scale, offset;
+    unsigned mspos0, mspos1;
     CS_LOCALS(r300);
 
     BEGIN_CS(size);
     OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status);
 
-    OUT_CS_REG(R300_GB_AA_CONFIG, rs->antialiasing_config);
+    /* Multisampling. Depends on framebuffer sample count. */
+    if (r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0)) {
+        if (fb->nr_cbufs && fb->cbufs[0]->texture->nr_samples > 1) {
+            /* Subsample placement. These may not be optimal. */
+            switch (fb->cbufs[0]->texture->nr_samples) {
+                case 2:
+                    mspos0 = 0x33996633;
+                    mspos1 = 0x6666663;
+                    break;
+                case 3:
+                    mspos0 = 0x33936933;
+                    mspos1 = 0x6666663;
+                    break;
+                case 4:
+                    mspos0 = 0x33939933;
+                    mspos1 = 0x3966663;
+                    break;
+                case 6:
+                    mspos0 = 0x22a2aa22;
+                    mspos1 = 0x2a65672;
+                    break;
+                default:
+                    debug_printf("r300: Bad number of multisamples!\n");
+                    mspos0 = rs->multisample_position_0;
+                    mspos1 = rs->multisample_position_1;
+                    break;
+            }
+
+            OUT_CS_REG_SEQ(R300_GB_MSPOS0, 2);
+            OUT_CS(mspos0);
+            OUT_CS(mspos1);
+        } else {
+            OUT_CS_REG_SEQ(R300_GB_MSPOS0, 2);
+            OUT_CS(rs->multisample_position_0);
+            OUT_CS(rs->multisample_position_1);
+        }
+    }
 
     OUT_CS_REG(R300_GA_POINT_SIZE, rs->point_size);
     OUT_CS_REG_SEQ(R300_GA_POINT_MINMAX, 2);
@@ -940,6 +843,17 @@
     CS_LOCALS(r300);
 
     BEGIN_CS(size);
+    /* Amount of time to wait for vertex fetches in PVS */
+    OUT_CS_REG(VAP_PVS_VTX_TIMEOUT_REG, 0xffff);
+
+    OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4);
+    OUT_CS_32F(1.0);
+    OUT_CS_32F(1.0);
+    OUT_CS_32F(1.0);
+    OUT_CS_32F(1.0);
+
+    OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, R300_SGN_NORM_NO_ZERO);
+
     /* R300_VAP_PVS_CODE_CNTL_0
      * R300_VAP_PVS_CONST_CNTL
      * R300_VAP_PVS_CODE_CNTL_1
@@ -1004,12 +918,7 @@
 
     BEGIN_CS(size);
     OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
-    OUT_CS_32F(viewport->xscale);
-    OUT_CS_32F(viewport->xoffset);
-    OUT_CS_32F(viewport->yscale);
-    OUT_CS_32F(viewport->yoffset);
-    OUT_CS_32F(viewport->zscale);
-    OUT_CS_32F(viewport->zoffset);
+    OUT_CS_TABLE(&viewport->xscale, 6);
     OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
     END_CS;
 }
@@ -1091,10 +1000,9 @@
         }
     }
     /* ...occlusion query buffer... */
-    if (r300->query_start.dirty ||
-        (r300->query_current && r300->query_current->begin_emitted)) {
-        if (!r300_add_buffer(r300->rws, r300->oqbo,
-			     0, r300_buffer(r300->oqbo)->domain)) {
+    if (r300->query_current) {
+        if (!r300->rws->add_buffer(r300->rws, r300->query_current->buffer,
+                                   0, r300->query_current->domain)) {
             r300->context.flush(&r300->context, 0, NULL);
             goto validate;
         }
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 9066088..0d4e1f7 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -29,6 +29,8 @@
 struct rX00_fragment_program_code;
 struct r300_vertex_program_code;
 
+uint32_t pack_float24(float f);
+
 void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed);
 
 void r300_emit_blend_state(struct r300_context* r300,
@@ -43,16 +45,12 @@
 void r300_emit_dsa_state(struct r300_context* r300,
                          unsigned size, void* state);
 
-unsigned r300_get_fs_atom_size(struct r300_context *r300);
-
 void r300_emit_fs(struct r300_context* r300, unsigned size, void *state);
 
 void r300_emit_fs_constants(struct r300_context* r300, unsigned size, void *state);
 
 void r300_emit_fs_rc_constant_state(struct r300_context* r300, unsigned size, void *state);
 
-unsigned r500_get_fs_atom_size(struct r300_context *r300);
-
 void r500_emit_fs(struct r300_context* r300, unsigned size, void *state);
 
 void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *state);
@@ -61,6 +59,10 @@
 
 void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state);
 
+void r300_emit_gpu_flush(struct r300_context *r300, unsigned size, void *state);
+
+void r300_emit_aa_state(struct r300_context *r300, unsigned size, void *state);
+
 void r300_emit_query_start(struct r300_context *r300, unsigned size, void *state);
 
 void r300_emit_query_end(struct r300_context* r300);
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 360b19a..ba840bf 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -39,8 +39,6 @@
     struct r300_atom *atom;
     struct r300_fence **rfence = (struct r300_fence**)fence;
 
-    CS_LOCALS(r300);
-    (void) cs_count;
     /* We probably need to flush Draw, but we may have been called from
      * within Draw. This feels kludgy, but it might be the best thing.
      *
@@ -52,7 +50,10 @@
     if (r300->dirty_hw) {
         r300_emit_query_end(r300);
 
-        FLUSH_CS;
+        if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) {
+            r300->flush_counter++;
+        }
+        r300->rws->flush_cs(r300->rws);
         r300->dirty_hw = 0;
 
         /* New kitchen sink, baby. */
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index a434808..424f831 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -28,7 +28,9 @@
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_ureg.h"
 
+#include "r300_cb.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_screen.h"
 #include "r300_fs.h"
 #include "r300_reg.h"
@@ -229,6 +231,133 @@
     ureg_destroy(ureg);
 }
 
+/* Build the fragment shader's command buffer (shader->cb_code) once, so
+ * that r300_emit_fs()/r500_emit_fs() can later hand it to WRITE_CS_TABLE
+ * instead of re-encoding the program on every emit.
+ *
+ * shader->cb_code_size is computed up front and passed to NEW_CB, so it has
+ * to match the number of dwords written by the OUT_CB_* calls below.
+ * Immediates are baked into the buffer; the R300 path packs them with
+ * pack_float24() while the R500 path stores the floats unchanged. */
+static void r300_emit_fs_code_to_buffer(
+    struct r300_context *r300,
+    struct r300_fragment_shader_code *shader)
+{
+    struct rX00_fragment_program_code *generic_code = &shader->code;
+    unsigned imm_count = shader->immediates_count;
+    unsigned imm_first = shader->externals_count;
+    unsigned imm_end = generic_code->constants.Count;
+    struct rc_constant *constants = generic_code->constants.Constants;
+    unsigned i;
+    CB_LOCALS;
+
+    if (r300->screen->caps.is_r500) {
+        struct r500_fragment_program_code *code = &generic_code->code.r500;
+
+        /* 6 dwords per instruction and 7 per immediate, plus 17 fixed. */
+        shader->cb_code_size = 17 +
+                               ((code->inst_end + 1) * 6) +
+                               imm_count * 7;
+
+        NEW_CB(shader->cb_code, shader->cb_code_size);
+        OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+        OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx);
+        OUT_CB_REG(R500_US_CODE_RANGE,
+                   R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
+        OUT_CB_REG(R500_US_CODE_OFFSET, 0);
+        OUT_CB_REG(R500_US_CODE_ADDR,
+                   R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(code->inst_end));
+
+        OUT_CB_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_INSTR);
+        OUT_CB_ONE_REG(R500_GA_US_VECTOR_DATA, (code->inst_end + 1) * 6);
+        for (i = 0; i <= code->inst_end; i++) {
+            OUT_CB(code->inst[i].inst0);
+            OUT_CB(code->inst[i].inst1);
+            OUT_CB(code->inst[i].inst2);
+            OUT_CB(code->inst[i].inst3);
+            OUT_CB(code->inst[i].inst4);
+            OUT_CB(code->inst[i].inst5);
+        }
+
+        /* Emit immediates. */
+        if (imm_count) {
+            for(i = imm_first; i < imm_end; ++i) {
+                if (constants[i].Type == RC_CONSTANT_IMMEDIATE) {
+                    const float *data = constants[i].u.Immediate;
+
+                    OUT_CB_REG(R500_GA_US_VECTOR_INDEX,
+                               R500_GA_US_VECTOR_INDEX_TYPE_CONST |
+                               (i & R500_GA_US_VECTOR_INDEX_MASK));
+                    OUT_CB_ONE_REG(R500_GA_US_VECTOR_DATA, 4);
+                    OUT_CB_TABLE(data, 4);
+                }
+            }
+        }
+    } else { /* r300 */
+        struct r300_fragment_program_code *code = &generic_code->code.r300;
+
+        /* 4 dwords per ALU instruction, 5 per immediate, 2 more on R400. */
+        shader->cb_code_size = 19 +
+                               (r300->screen->caps.is_r400 ? 2 : 0) +
+                               code->alu.length * 4 +
+                               (code->tex.length ? (1 + code->tex.length) : 0) +
+                               imm_count * 5;
+
+        NEW_CB(shader->cb_code, shader->cb_code_size);
+
+        if (r300->screen->caps.is_r400)
+            OUT_CB_REG(R400_US_CODE_BANK, 0);
+
+        OUT_CB_REG(R300_US_CONFIG, code->config);
+        OUT_CB_REG(R300_US_PIXSIZE, code->pixsize);
+        OUT_CB_REG(R300_US_CODE_OFFSET, code->code_offset);
+
+        OUT_CB_REG_SEQ(R300_US_CODE_ADDR_0, 4);
+        OUT_CB_TABLE(code->code_addr, 4);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_RGB_INST_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].rgb_inst);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].rgb_addr);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].alpha_inst);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].alpha_addr);
+
+        if (code->tex.length) {
+            OUT_CB_REG_SEQ(R300_US_TEX_INST_0, code->tex.length);
+            OUT_CB_TABLE(code->tex.inst, code->tex.length);
+        }
+
+        /* Emit immediates. */
+        if (imm_count) {
+            for(i = imm_first; i < imm_end; ++i) {
+                if (constants[i].Type == RC_CONSTANT_IMMEDIATE) {
+                    const float *data = constants[i].u.Immediate;
+
+                    OUT_CB_REG_SEQ(R300_PFS_PARAM_0_X + i * 16, 4);
+                    OUT_CB(pack_float24(data[0]));
+                    OUT_CB(pack_float24(data[1]));
+                    OUT_CB(pack_float24(data[2]));
+                    OUT_CB(pack_float24(data[3]));
+                }
+            }
+        }
+    }
+
+    /* Common tail written for both the R500 and the R300 path. */
+    OUT_CB_REG(R300_FG_DEPTH_SRC, shader->fg_depth_src);
+    OUT_CB_REG(R300_US_W_FMT, shader->us_out_w);
+    END_CB;
+}
+
 static void r300_translate_fragment_shader(
     struct r300_context* r300,
     struct r300_fragment_shader_code* shader,
@@ -338,6 +456,9 @@
 
     /* And, finally... */
     rc_destroy(&compiler.Base);
+
+    /* Build the command buffer. */
+    r300_emit_fs_code_to_buffer(r300, shader);
 }
 
 boolean r300_pick_fragment_shader(struct r300_context* r300)
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 1cc4355..51bfa88 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -50,6 +50,9 @@
     struct r300_fragment_program_external_state compare_state;
     struct rX00_fragment_program_code code;
 
+    unsigned cb_code_size;
+    uint32_t *cb_code;
+
     struct r300_fragment_shader_code* next;
 };
 
diff --git a/src/gallium/drivers/r300/r300_hyperz.c b/src/gallium/drivers/r300/r300_hyperz.c
index b41b6b1..e5c7658 100644
--- a/src/gallium/drivers/r300/r300_hyperz.c
+++ b/src/gallium/drivers/r300/r300_hyperz.c
@@ -31,33 +31,52 @@
 /* The ZTOP state                                                            */
 /*****************************************************************************/
 
-static boolean r300_dsa_writes_depth_stencil(struct r300_dsa_state* dsa)
+/* Report whether one face's stencil state is able to modify the stencil
+ * buffer: the test must be enabled, the write mask non-zero, and at least
+ * one of the fail/zfail/zpass operations must differ from KEEP. */
+static boolean r300_dsa_writes_stencil(
+        struct pipe_stencil_state *s)
 {
-    /* We are interested only in the cases when a new depth or stencil value
-     * can be written and changed. */
-
-    /* We might optionally check for [Z func: never] and inspect the stencil
-     * state in a similar fashion, but it's not terribly important. */
-    return (dsa->z_buffer_control & R300_Z_WRITE_ENABLE) ||
-           (dsa->stencil_ref_mask & R300_STENCILWRITEMASK_MASK) ||
-           ((dsa->z_buffer_control & R500_STENCIL_REFMASK_FRONT_BACK) &&
-            (dsa->stencil_ref_bf & R300_STENCILWRITEMASK_MASK));
+    if (!s->enabled || !s->writemask)
+        return FALSE;
+
+    return s->fail_op  != PIPE_STENCIL_OP_KEEP ||
+           s->zfail_op != PIPE_STENCIL_OP_KEEP ||
+           s->zpass_op != PIPE_STENCIL_OP_KEEP;
 }
 
-static boolean r300_dsa_alpha_test_enabled(struct r300_dsa_state* dsa)
+/* Report whether the DSA state can change a depth or a stencil value. */
+static boolean r300_dsa_writes_depth_stencil(
+        struct pipe_depth_stencil_alpha_state *dsa)
+{
+    /* Depth changes only with the test enabled, writes unmasked and a
+     * compare function other than NEVER. */
+    boolean depth_written = dsa->depth.enabled && dsa->depth.writemask &&
+                            dsa->depth.func != PIPE_FUNC_NEVER;
+
+    /* Failing that, either stencil face may still write. */
+    return depth_written ||
+           r300_dsa_writes_stencil(&dsa->stencil[0]) ||
+           r300_dsa_writes_stencil(&dsa->stencil[1]);
+}
+
+static boolean r300_dsa_alpha_test_enabled(
+        struct pipe_depth_stencil_alpha_state *dsa)
 {
     /* We are interested only in the cases when alpha testing can kill
      * a fragment. */
-    uint32_t af = dsa->alpha_function;
 
-    return (af & R300_FG_ALPHA_FUNC_ENABLE) &&
-           (af & R300_FG_ALPHA_FUNC_ALWAYS) != R300_FG_ALPHA_FUNC_ALWAYS;
+    if (!dsa->alpha.enabled)
+        return FALSE;
+
+    return dsa->alpha.func != PIPE_FUNC_ALWAYS;
 }
 
 static void r300_update_ztop(struct r300_context* r300)
 {
     struct r300_ztop_state* ztop_state =
         (struct r300_ztop_state*)r300->ztop_state.state;
+    uint32_t old_ztop = ztop_state->z_buffer_top;
 
     /* This is important enough that I felt it warranted a comment.
      *
@@ -99,7 +112,8 @@
         ztop_state->z_buffer_top = R300_ZTOP_ENABLE;
     }
 
-    r300->ztop_state.dirty = TRUE;
+    if (ztop_state->z_buffer_top != old_ztop)
+        r300->ztop_state.dirty = TRUE;
 }
 
 void r300_update_hyperz_state(struct r300_context* r300)
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 7c08806..10086ee 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -26,6 +26,7 @@
 #include "r300_context.h"
 #include "r300_screen.h"
 #include "r300_emit.h"
+#include "r300_winsys.h"
 
 #include <stdio.h>
 
@@ -34,30 +35,29 @@
 {
     struct r300_context *r300 = r300_context(pipe);
     struct r300_screen *r300screen = r300->screen;
-    unsigned query_size;
-    struct r300_query *q, *qptr;
+    struct r300_query *q;
+
+    if (query_type != PIPE_QUERY_OCCLUSION_COUNTER) {
+        return NULL;
+    }
 
     q = CALLOC_STRUCT(r300_query);
+    if (!q)
+        return NULL;
 
     q->type = query_type;
-    assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+    q->domain = R300_DOMAIN_GTT;
+    q->buffer_size = 4096;
 
     if (r300screen->caps.family == CHIP_FAMILY_RV530)
-        query_size = r300screen->caps.num_z_pipes * sizeof(uint32_t);
+        q->num_pipes = r300screen->caps.num_z_pipes;
     else
-        query_size = r300screen->caps.num_frag_pipes * sizeof(uint32_t);
+        q->num_pipes = r300screen->caps.num_frag_pipes;
 
-    if (!is_empty_list(&r300->query_list)) {
-        qptr = last_elem(&r300->query_list);
-        q->offset = qptr->offset + query_size;
-    }
     insert_at_tail(&r300->query_list, q);
 
-    /* XXX */
-    if (q->offset >= 4096) {
-        q->offset = 0;
-        fprintf(stderr, "r300: Rewinding OQBO...\n");
-    }
+    /* Create the buffer object that will receive this query's results. */
+    q->buffer = r300->rws->buffer_create(r300->rws, 4096, 0, q->domain, q->buffer_size);
 
     return (struct pipe_query*)q;
 }
@@ -65,18 +65,26 @@
 static void r300_destroy_query(struct pipe_context* pipe,
                                struct pipe_query* query)
 {
-    struct r300_query* q = (struct r300_query*)query;
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_query* q = r300_query(query);
 
+    r300->rws->buffer_reference(r300->rws, &q->buffer, NULL);
     remove_from_list(q);
     FREE(query);
 }
 
+void r300_resume_query(struct r300_context *r300,
+                       struct r300_query *query)
+{
+    r300->query_current = query;
+    r300->query_start.dirty = TRUE;
+}
+
 static void r300_begin_query(struct pipe_context* pipe,
                              struct pipe_query* query)
 {
-    uint32_t value = ~0U;
     struct r300_context* r300 = r300_context(pipe);
-    struct r300_query* q = (struct r300_query*)query;
+    struct r300_query* q = r300_query(query);
 
     if (r300->query_current != NULL) {
         fprintf(stderr, "r300: begin_query: "
@@ -85,30 +93,29 @@
         return;
     }
 
-    pipe_buffer_write(pipe,
-		      r300->oqbo,
-		      q->offset,
-		      sizeof value,
-		      &value);
+    q->num_results = 0;
+    r300_resume_query(r300, q);
+}
 
-    q->flushed = FALSE;
-    r300->query_current = q;
-    r300->query_start.dirty = TRUE;
+void r300_stop_query(struct r300_context *r300)
+{
+    r300_emit_query_end(r300);
+    r300->query_current = NULL;
 }
 
 static void r300_end_query(struct pipe_context* pipe,
 	                   struct pipe_query* query)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_query *q = r300_query(query);
 
-    if ((struct r300_query*)query != r300->query_current) {
+    if (q != r300->query_current) {
         fprintf(stderr, "r300: end_query: Got invalid query.\n");
         assert(0);
         return;
     }
 
-    r300_emit_query_end(r300);
-    r300->query_current = NULL;
+    r300_stop_query(r300);
 }
 
 static boolean r300_get_query_result(struct pipe_context* pipe,
@@ -117,54 +124,28 @@
                                      void* vresult)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct r300_screen* r300screen = r300->screen;
-    struct r300_query *q = (struct r300_query*)query;
-    struct pipe_transfer *transfer;
-    unsigned flags = PIPE_TRANSFER_READ;
-    uint32_t* map;
-    uint32_t temp = 0;
-    unsigned i, num_results;
+    struct r300_query *q = r300_query(query);
+    unsigned flags, i;
+    uint32_t temp, *map;
     uint64_t *result = (uint64_t*)vresult;
 
-    if (q->flushed == FALSE)
+    if (!q->flushed)
         pipe->flush(pipe, 0, NULL);
-    if (!wait) {
-        flags |= PIPE_TRANSFER_DONTBLOCK;
-    }
 
-    map = pipe_buffer_map(pipe, r300->oqbo, flags, &transfer);
+    flags = PIPE_TRANSFER_READ | (!wait ? PIPE_TRANSFER_DONTBLOCK : 0);
+
+    map = r300->rws->buffer_map(r300->rws, q->buffer, flags);
     if (!map)
         return FALSE;
-    map += q->offset / 4;
 
-    if (r300screen->caps.family == CHIP_FAMILY_RV530)
-        num_results = r300screen->caps.num_z_pipes;
-    else
-        num_results = r300screen->caps.num_frag_pipes;
-
-    for (i = 0; i < num_results; i++) {
-        if (*map == ~0U) {
-            /* Looks like our results aren't ready yet. */
-            if (wait) {
-                fprintf(stderr, "r300: Despite waiting, OQ results haven't "
-                                "come in yet. This is a driver bug.\n"
-                                "r300: Returning bogus results to avoid "
-                                "a possible infinite loop...\n");
-                temp = 987654321;
-            } else {
-                temp = ~0U;
-            }
-            break;
-        }
+    /* Sum up the results. */
+    temp = 0;
+    for (i = 0; i < q->num_results; i++) {
         temp += *map;
         map++;
     }
-    pipe_buffer_unmap(pipe, r300->oqbo, transfer);
 
-    if (temp == ~0U) {
-        /* Our results haven't been written yet... */
-        return FALSE;
-    }
+    r300->rws->buffer_unmap(r300->rws, q->buffer);
 
     *result = temp;
     return TRUE;
@@ -192,11 +173,61 @@
     }
 }
 
+/***************************************************************************
+ * Fake occlusion queries (for debugging)
+ ***************************************************************************/
+
+static unsigned r300_fake_query;
+
+static struct pipe_query *r300_fake_create_query(struct pipe_context *pipe,
+                                                 unsigned query_type)
+{
+    return (struct pipe_query*)&r300_fake_query;
+}
+
+static void r300_fake_destroy_query(struct pipe_context* pipe,
+                                    struct pipe_query* query)
+{
+}
+
+static void r300_fake_begin_query(struct pipe_context* pipe,
+                                  struct pipe_query* query)
+{
+}
+
+static void r300_fake_end_query(struct pipe_context* pipe,
+                                struct pipe_query* query)
+{
+}
+
+static boolean r300_fake_get_query_result(struct pipe_context* pipe,
+                                          struct pipe_query* query,
+                                          boolean wait, void* vresult)
+{
+    /* Debug stub: report a fixed, non-zero count without touching hardware. */
+    *(uint64_t*)vresult = 1000000;
+    return TRUE;
+}
+
+static void r300_fake_render_condition(struct pipe_context *pipe,
+                                       struct pipe_query *query, uint mode)
+{
+}
+
 void r300_init_query_functions(struct r300_context* r300) {
-    r300->context.create_query = r300_create_query;
-    r300->context.destroy_query = r300_destroy_query;
-    r300->context.begin_query = r300_begin_query;
-    r300->context.end_query = r300_end_query;
-    r300->context.get_query_result = r300_get_query_result;
-    r300->context.render_condition = r300_render_condition;
+    if (DBG_ON(r300, DBG_FAKE_OCC)) {
+        r300->context.create_query = r300_fake_create_query;
+        r300->context.destroy_query = r300_fake_destroy_query;
+        r300->context.begin_query = r300_fake_begin_query;
+        r300->context.end_query = r300_fake_end_query;
+        r300->context.get_query_result = r300_fake_get_query_result;
+        r300->context.render_condition = r300_fake_render_condition;
+    } else {
+        r300->context.create_query = r300_create_query;
+        r300->context.destroy_query = r300_destroy_query;
+        r300->context.begin_query = r300_begin_query;
+        r300->context.end_query = r300_end_query;
+        r300->context.get_query_result = r300_get_query_result;
+        r300->context.render_condition = r300_render_condition;
+    }
 }
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index c4fa19a..1805601 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1630,6 +1630,40 @@
 #       define R300_TX_FORMAT_GAMMA               (1 << 21)
 #       define R300_TX_FORMAT_YUV_TO_RGB          (1 << 22)
 
+#       define R300_TX_CACHE(x)                 ((x) << 27)
+#       define R300_TX_CACHE_WHOLE              0
+/* 1 is reserved */
+#       define R300_TX_CACHE_HALF_0             2
+#       define R300_TX_CACHE_HALF_1             3
+#       define R300_TX_CACHE_FOURTH_0           4
+#       define R300_TX_CACHE_FOURTH_1           5
+#       define R300_TX_CACHE_FOURTH_2           6
+#       define R300_TX_CACHE_FOURTH_3           7
+#       define R300_TX_CACHE_EIGHTH_0           8
+#       define R300_TX_CACHE_EIGHTH_1           9
+#       define R300_TX_CACHE_EIGHTH_2           10
+#       define R300_TX_CACHE_EIGHTH_3           11
+#       define R300_TX_CACHE_EIGHTH_4           12
+#       define R300_TX_CACHE_EIGHTH_5           13
+#       define R300_TX_CACHE_EIGHTH_6           14
+#       define R300_TX_CACHE_EIGHTH_7           15
+#       define R300_TX_CACHE_SIXTEENTH_0        16
+#       define R300_TX_CACHE_SIXTEENTH_1        17
+#       define R300_TX_CACHE_SIXTEENTH_2        18
+#       define R300_TX_CACHE_SIXTEENTH_3        19
+#       define R300_TX_CACHE_SIXTEENTH_4        20
+#       define R300_TX_CACHE_SIXTEENTH_5        21
+#       define R300_TX_CACHE_SIXTEENTH_6        22
+#       define R300_TX_CACHE_SIXTEENTH_7        23
+#       define R300_TX_CACHE_SIXTEENTH_8        24
+#       define R300_TX_CACHE_SIXTEENTH_9        25
+#       define R300_TX_CACHE_SIXTEENTH_10       26
+#       define R300_TX_CACHE_SIXTEENTH_11       27
+#       define R300_TX_CACHE_SIXTEENTH_12       28
+#       define R300_TX_CACHE_SIXTEENTH_13       29
+#       define R300_TX_CACHE_SIXTEENTH_14       30
+#       define R300_TX_CACHE_SIXTEENTH_15       31
+
 #define R300_TX_FORMAT2_0		    0x4500 /* obvious missing in gap */
 #       define R300_TX_PITCHMASK_SHIFT           0
 #       define R300_TX_PITCHMASK_MASK            (2047 << 0)
@@ -2639,6 +2673,24 @@
 /* Z Buffer Clear Value */
 #define R300_ZB_DEPTHCLEARVALUE                  0x4f28
 
+/* Z Mask RAM is a Z compression buffer.
+ * Each dword of the Z Mask contains compression info for 16 4x4 pixel blocks,
+ * that is 2 bits for each block.
+ * On chips with 2 Z pipes, every other dword maps to a different pipe.
+ */
+
+/* The dword offset into Z mask RAM (bits 18:4) */
+#define R300_ZB_ZMASK_OFFSET                     0x4f30
+
+/* Z Mask Pitch. */
+#define R300_ZB_ZMASK_PITCH                      0x4f34
+
+/* Access to Z Mask RAM in a manner similar to HiZ RAM.
+ * The indices are autoincrementing. */
+#define R300_ZB_ZMASK_WRINDEX                    0x4f38
+#define R300_ZB_ZMASK_DWORD                      0x4f3c
+#define R300_ZB_ZMASK_RDINDEX                    0x4f40
+
 /* Hierarchical Z Memory Offset */
 #define R300_ZB_HIZ_OFFSET                       0x4f44
 
@@ -3437,9 +3489,18 @@
 #       define RADEON_WAIT_3D_IDLECLEAN     (1 << 17)
 #       define RADEON_WAIT_HOST_IDLECLEAN   (1 << 18)
 
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+
+#define RADEON_CP_PACKET0 0x0 /* XXX stolen from radeon_reg.h */
 #define RADEON_CP_PACKET3                           0xC0000000
 
-#define R200_3D_DRAW_IMMD_2      0xC0003500
+#define RADEON_ONE_REG_WR        (1 << 15)
+
+#define CP_PACKET0(register, count) \
+    (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
+
+#define CP_PACKET3(op, count) \
+    (RADEON_CP_PACKET3 | (op) | ((count) << 16))
 
 #endif /* _R300_REG_H */
 
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 80dea8b..99ad162 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -35,6 +35,7 @@
 #include "util/u_prim.h"
 
 #include "r300_cs.h"
+#include "r300_cb.h"
 #include "r300_context.h"
 #include "r300_screen_buffer.h"
 #include "r300_emit.h"
@@ -43,6 +44,8 @@
 
 #include <limits.h>
 
+#define IMMD_DWORDS 32
+
 static uint32_t r300_translate_primitive(unsigned prim)
 {
     switch (prim) {
@@ -269,7 +272,7 @@
         return FALSE;
     }
 
-    if (count > 10) {
+    if (count * r300->velems->vertex_size_dwords > IMMD_DWORDS) {
         return FALSE;
     }
 
@@ -308,10 +311,10 @@
     struct pipe_vertex_element* velem;
     struct pipe_vertex_buffer* vbuf;
     unsigned vertex_element_count = r300->velems->count;
-    unsigned i, v, vbi, dw, elem_offset, dwords;
+    unsigned i, v, vbi, dwords;
 
     /* Size of the vertex, in dwords. */
-    unsigned vertex_size = 0;
+    unsigned vertex_size = r300->velems->vertex_size_dwords;
 
     /* Offsets of the attribute, in dwords, from the start of the vertex. */
     unsigned offset[PIPE_MAX_ATTRIBS];
@@ -327,14 +330,13 @@
     uint32_t* map[PIPE_MAX_ATTRIBS] = {0};
     struct pipe_transfer* transfer[PIPE_MAX_ATTRIBS] = {NULL};
 
-    CS_LOCALS(r300);
+    CB_LOCALS;
 
     /* Calculate the vertex size, offsets, strides etc. and map the buffers. */
     for (i = 0; i < vertex_element_count; i++) {
         velem = &r300->velems->velem[i];
         offset[i] = velem->src_offset / 4;
         size[i] = r300->velems->hw_format_size[i] / 4;
-        vertex_size += size[i];
         vbi = velem->vertex_buffer_index;
 
         /* Map the buffer. */
@@ -344,8 +346,8 @@
                                                   vbuf->buffer,
                                                   PIPE_TRANSFER_READ,
 						  &transfer[vbi]);
-            map[vbi] += vbuf->buffer_offset / 4;
             stride[vbi] = vbuf->stride / 4;
+            map[vbi] += vbuf->buffer_offset / 4 + stride[vbi] * start;
         }
     }
 
@@ -353,30 +355,26 @@
 
     r300_prepare_for_rendering(r300, PREP_FIRST_DRAW, NULL, dwords, 0, 0, NULL);
 
-    BEGIN_CS(dwords);
-    OUT_CS_REG(R300_GA_COLOR_CONTROL,
+    BEGIN_CS_AS_CB(r300, dwords);
+    OUT_CB_REG(R300_GA_COLOR_CONTROL,
             r300_provoking_vertex_fixes(r300, mode));
-    OUT_CS_REG(R300_VAP_VTX_SIZE, vertex_size);
-    OUT_CS_REG_SEQ(R300_VAP_VF_MAX_VTX_INDX, 2);
-    OUT_CS(count - 1);
-    OUT_CS(0);
-    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
-    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
+    OUT_CB_REG(R300_VAP_VTX_SIZE, vertex_size);
+    OUT_CB_REG_SEQ(R300_VAP_VF_MAX_VTX_INDX, 2);
+    OUT_CB(count - 1);
+    OUT_CB(0);
+    OUT_CB_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
+    OUT_CB(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
             r300_translate_primitive(mode));
 
     /* Emit vertices. */
     for (v = 0; v < count; v++) {
         for (i = 0; i < vertex_element_count; i++) {
-            velem = &r300->velems->velem[i];
-            vbi = velem->vertex_buffer_index;
-            elem_offset = offset[i] + stride[vbi] * (v + start);
+            vbi = r300->velems->velem[i].vertex_buffer_index;
 
-            for (dw = 0; dw < size[i]; dw++) {
-                OUT_CS(map[vbi][elem_offset + dw]);
-            }
+            OUT_CB_TABLE(&map[vbi][offset[i] + stride[vbi] * v], size[i]);
         }
     }
-    END_CS;
+    END_CB;
 
     /* Unmap buffers. */
     for (i = 0; i < vertex_element_count; i++) {
@@ -482,111 +480,6 @@
     END_CS;
 }
 
-static void r300_shorten_ubyte_elts(struct r300_context* r300,
-                                    struct pipe_resource** elts,
-                                    int index_bias,
-                                    unsigned start,
-                                    unsigned count)
-{
-    struct pipe_context* context = &r300->context;
-    struct pipe_screen* screen = r300->context.screen;
-    struct pipe_resource* new_elts;
-    unsigned char *in_map;
-    unsigned short *out_map;
-    struct pipe_transfer *src_transfer, *dst_transfer;
-    unsigned i;
-
-    new_elts = pipe_buffer_create(screen,
-				  PIPE_BIND_INDEX_BUFFER,
-				  2 * count);
-
-    in_map = pipe_buffer_map(context, *elts, PIPE_TRANSFER_READ, &src_transfer);
-    out_map = pipe_buffer_map(context, new_elts, PIPE_TRANSFER_WRITE, &dst_transfer);
-
-    in_map += start;
-
-    for (i = 0; i < count; i++) {
-        *out_map = (unsigned short)(*in_map + index_bias);
-        in_map++;
-        out_map++;
-    }
-
-    pipe_buffer_unmap(context, *elts, src_transfer);
-    pipe_buffer_unmap(context, new_elts, dst_transfer);
-
-    *elts = new_elts;
-}
-
-static void r300_rebuild_ushort_elts(struct r300_context *r300,
-                                     struct pipe_resource **elts,
-                                     int index_bias,
-                                     unsigned start, unsigned count)
-{
-    struct pipe_context *context = &r300->context;
-    struct pipe_transfer *in_transfer = NULL;
-    struct pipe_transfer *out_transfer = NULL;
-    struct pipe_resource *new_elts;
-    unsigned short *in_map;
-    unsigned short *out_map;
-    unsigned i;
-
-    new_elts = pipe_buffer_create(context->screen,
-				  PIPE_BIND_INDEX_BUFFER,
-				  2 * count);
-
-    in_map = pipe_buffer_map(context, *elts,
-                             PIPE_TRANSFER_READ, &in_transfer);
-    out_map = pipe_buffer_map(context, new_elts,
-			      PIPE_TRANSFER_WRITE, &out_transfer);
-
-    in_map += start;
-    for (i = 0; i < count; i++) {
-        *out_map = (unsigned short)(*in_map + index_bias);
-        in_map++;
-        out_map++;
-    }
-
-    pipe_buffer_unmap(context, *elts, in_transfer);
-    pipe_buffer_unmap(context, new_elts, out_transfer);
-
-    *elts = new_elts;
-}
-
-static void r300_rebuild_uint_elts(struct r300_context *r300,
-                                   struct pipe_resource **elts,
-                                   int index_bias,
-                                   unsigned start, unsigned count)
-{
-    struct pipe_context *context = &r300->context;
-    struct pipe_transfer *in_transfer = NULL;
-    struct pipe_transfer *out_transfer = NULL;
-    struct pipe_resource *new_elts;
-    unsigned int *in_map;
-    unsigned int *out_map;
-    unsigned i;
-
-    new_elts = pipe_buffer_create(context->screen,
-                                  PIPE_BIND_INDEX_BUFFER,
-                                  2 * count);
-
-    in_map = pipe_buffer_map(context, *elts,
-                             PIPE_TRANSFER_READ, &in_transfer);
-    out_map = pipe_buffer_map(context, new_elts,
-                              PIPE_TRANSFER_WRITE, &out_transfer);
-
-    in_map += start;
-    for (i = 0; i < count; i++) {
-        *out_map = (unsigned int)(*in_map + index_bias);
-        in_map++;
-        out_map++;
-    }
-
-    pipe_buffer_unmap(context, *elts, in_transfer);
-    pipe_buffer_unmap(context, new_elts, out_transfer);
-
-    *elts = new_elts;
-}
-
 /* This is the fast-path drawing & emission for HW TCL. */
 static void r300_draw_range_elements(struct pipe_context* pipe,
                                      struct pipe_resource* indexBuffer,
@@ -605,46 +498,34 @@
                             r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0);
     unsigned short_count;
     int buffer_offset = 0, index_offset = 0; /* for index bias emulation */
+    boolean translate = FALSE;
 
     if (r300->skip_rendering) {
         return;
     }
 
-    if (r300->incompatible_vb_layout ||
-        r300->velems->incompatible_layout) {
-        return;
-    }
-
     if (!u_trim_pipe_prim(mode, &count)) {
         return;
     }
 
+    /* Index buffer range checking. */
+    if ((start + count) * indexSize > indexBuffer->width0) {
+        fprintf(stderr, "r300: Invalid index buffer range. Skipping rendering.\n");
+        return;
+    }
+
+    /* Set up fallback for incompatible vertex layout if needed. */
+    if (r300->incompatible_vb_layout || r300->velems->incompatible_layout) {
+        r300_begin_vertex_translate(r300);
+        translate = TRUE;
+    }
+
     if (indexBias && !index_bias_supported(r300)) {
         r300_split_index_bias(r300, indexBias, &buffer_offset, &index_offset);
     }
 
-    /* Rebuild the index buffer if needed. */
-    switch (indexSize) {
-        case 1:
-            r300_shorten_ubyte_elts(r300, &indexBuffer, index_offset, start, count);
-            indexSize = 2;
-            start = 0;
-            break;
-
-        case 2:
-            if (start % 2 != 0 || index_offset) {
-                r300_rebuild_ushort_elts(r300, &indexBuffer, index_offset, start, count);
-                start = 0;
-            }
-            break;
-
-        case 4:
-            if (index_offset) {
-                r300_rebuild_uint_elts(r300, &indexBuffer, index_offset, start, count);
-                start = 0;
-            }
-            break;
-    }
+    r300_translate_index_buffer(r300, &indexBuffer, &indexSize, index_offset,
+                                &start, count);
 
     r300_update_derived_state(r300);
     r300_upload_index_buffer(r300, &indexBuffer, indexSize, start, count);
@@ -681,6 +562,10 @@
     if (indexBuffer != orgIndexBuffer) {
         pipe_resource_reference( &indexBuffer, NULL );
     }
+
+    if (translate) {
+        r300_end_vertex_translate(r300);
+    }
 }
 
 /* Simple helpers for context setup. Should probably be moved to util. */
@@ -704,18 +589,20 @@
                             count > 65536 &&
                             r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0);
     unsigned short_count;
+    boolean translate = FALSE;
 
     if (r300->skip_rendering) {
         return;
     }
 
-    if (r300->incompatible_vb_layout ||
-        r300->velems->incompatible_layout) {
+    if (!u_trim_pipe_prim(mode, &count)) {
         return;
     }
 
-    if (!u_trim_pipe_prim(mode, &count)) {
-        return;
+    /* Set up fallback for incompatible vertex layout if needed. */
+    if (r300->incompatible_vb_layout || r300->velems->incompatible_layout) {
+        r300_begin_vertex_translate(r300);
+        translate = TRUE;
     }
 
     r300_update_derived_state(r300);
@@ -747,6 +634,10 @@
         }
 	u_upload_flush(r300->upload_vb);
     }
+
+    if (translate) {
+        r300_end_vertex_translate(r300);
+    }
 }
 
 /****************************************************************************
@@ -1026,7 +917,7 @@
     unsigned max_index = (r300render->vbo_size - r300render->vbo_offset) /
                          (r300render->r300->vertex_info.size * 4) - 1;
     unsigned short_count;
-    struct r300_cs_info cs_info;
+    unsigned free_dwords;
 
     CS_LOCALS(r300);
 
@@ -1039,9 +930,9 @@
         NULL, 256, 0, 0, &end_cs_dwords);
 
     while (count) {
-        r300->rws->get_cs_info(r300->rws, &cs_info);
+        free_dwords = r300->rws->get_cs_free_dwords(r300->rws);
 
-        short_count = MIN2(count, (cs_info.free - end_cs_dwords - 6) * 2);
+        short_count = MIN2(count, (free_dwords - end_cs_dwords - 6) * 2);
 
         BEGIN_CS(6 + (short_count+1)/2);
         OUT_CS_REG(R300_GA_COLOR_CONTROL,
@@ -1127,132 +1018,45 @@
 }
 
 /****************************************************************************
- * Two-sided stencil reference value fallback. It's designed to be as much
- * separate from rest of the driver as possible.
+ *                         End of SW TCL functions                          *
  ***************************************************************************/
 
-struct r300_stencilref_context {
-    void (*draw_arrays)(struct pipe_context *pipe,
-                        unsigned mode, unsigned start, unsigned count);
-
-    void (*draw_range_elements)(
-        struct pipe_context *pipe, struct pipe_resource *indexBuffer,
-        unsigned indexSize, int indexBias, unsigned minIndex, unsigned maxIndex,
-        unsigned mode, unsigned start, unsigned count);
-
-    uint32_t rs_cull_mode;
-    uint32_t zb_stencilrefmask;
-    ubyte ref_value_front;
-};
-
-static boolean r300_stencilref_needed(struct r300_context *r300)
+static void r300_resource_resolve(struct pipe_context* pipe,
+                                  struct pipe_resource* dest,
+                                  struct pipe_subresource subdest,
+                                  struct pipe_resource* src,
+                                  struct pipe_subresource subsrc)
 {
-    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_aa_state *aa = (struct r300_aa_state*)r300->aa_state.state;
+    struct pipe_surface* srcsurf = src->screen->get_tex_surface(src->screen,
+            src, subsrc.face, subsrc.level, 0, 0);
+    float color[] = {0, 0, 0, 0};
 
-    return dsa->two_sided_stencil_ref ||
-           (dsa->two_sided &&
-            r300->stencil_ref.ref_value[0] != r300->stencil_ref.ref_value[1]);
-}
+    DBG(r300, DBG_DRAW, "r300: Resolving resource...\n");
 
-/* Set drawing for front faces. */
-static void r300_stencilref_begin(struct r300_context *r300)
-{
-    struct r300_stencilref_context *sr = r300->stencilref_fallback;
-    struct r300_rs_state *rs = (struct r300_rs_state*)r300->rs_state.state;
-    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
+    /* Enable AA resolve. */
+    aa->dest = r300_surface(
+            dest->screen->get_tex_surface(dest->screen, dest, subdest.face,
+                                          subdest.level, 0, 0));
 
-    /* Save state. */
-    sr->rs_cull_mode = rs->cull_mode;
-    sr->zb_stencilrefmask = dsa->stencil_ref_mask;
-    sr->ref_value_front = r300->stencil_ref.ref_value[0];
+    aa->aaresolve_ctl =
+        R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_RESOLVE |
+        R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE;
+    r300->aa_state.size = 12;
+    r300->aa_state.dirty = TRUE;
 
-    /* We *cull* pixels, therefore no need to mask out the bits. */
-    rs->cull_mode |= R300_CULL_BACK;
+    /* Resolve the surface. */
+    r300->context.clear_render_target(pipe,
+        srcsurf, color, 0, 0, src->width0, src->height0);
 
-    r300->rs_state.dirty = TRUE;
-}
+    /* Disable AA resolve. */
+    aa->aaresolve_ctl = 0;
+    r300->aa_state.size = 4;
+    r300->aa_state.dirty = TRUE;
 
-/* Set drawing for back faces. */
-static void r300_stencilref_switch_side(struct r300_context *r300)
-{
-    struct r300_stencilref_context *sr = r300->stencilref_fallback;
-    struct r300_rs_state *rs = (struct r300_rs_state*)r300->rs_state.state;
-    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
-
-    rs->cull_mode = sr->rs_cull_mode | R300_CULL_FRONT;
-    dsa->stencil_ref_mask = dsa->stencil_ref_bf;
-    r300->stencil_ref.ref_value[0] = r300->stencil_ref.ref_value[1];
-
-    r300->rs_state.dirty = TRUE;
-    r300->dsa_state.dirty = TRUE;
-}
-
-/* Restore the original state. */
-static void r300_stencilref_end(struct r300_context *r300)
-{
-    struct r300_stencilref_context *sr = r300->stencilref_fallback;
-    struct r300_rs_state *rs = (struct r300_rs_state*)r300->rs_state.state;
-    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
-
-    /* Restore state. */
-    rs->cull_mode = sr->rs_cull_mode;
-    dsa->stencil_ref_mask = sr->zb_stencilrefmask;
-    r300->stencil_ref.ref_value[0] = sr->ref_value_front;
-
-    r300->rs_state.dirty = TRUE;
-    r300->dsa_state.dirty = TRUE;
-}
-
-static void r300_stencilref_draw_arrays(struct pipe_context *pipe, unsigned mode,
-                                        unsigned start, unsigned count)
-{
-    struct r300_context *r300 = r300_context(pipe);
-    struct r300_stencilref_context *sr = r300->stencilref_fallback;
-
-    if (!r300_stencilref_needed(r300)) {
-        sr->draw_arrays(pipe, mode, start, count);
-    } else {
-        r300_stencilref_begin(r300);
-        sr->draw_arrays(pipe, mode, start, count);
-        r300_stencilref_switch_side(r300);
-        sr->draw_arrays(pipe, mode, start, count);
-        r300_stencilref_end(r300);
-    }
-}
-
-static void r300_stencilref_draw_range_elements(
-    struct pipe_context *pipe, struct pipe_resource *indexBuffer,
-    unsigned indexSize, int indexBias, unsigned minIndex, unsigned maxIndex,
-    unsigned mode, unsigned start, unsigned count)
-{
-    struct r300_context *r300 = r300_context(pipe);
-    struct r300_stencilref_context *sr = r300->stencilref_fallback;
-
-    if (!r300_stencilref_needed(r300)) {
-        sr->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
-                                minIndex, maxIndex, mode, start, count);
-    } else {
-        r300_stencilref_begin(r300);
-        sr->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
-                                minIndex, maxIndex, mode, start, count);
-        r300_stencilref_switch_side(r300);
-        sr->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
-                                minIndex, maxIndex, mode, start, count);
-        r300_stencilref_end(r300);
-    }
-}
-
-static void r300_plug_in_stencil_ref_fallback(struct r300_context *r300)
-{
-    r300->stencilref_fallback = CALLOC_STRUCT(r300_stencilref_context);
-
-    /* Save original draw functions. */
-    r300->stencilref_fallback->draw_arrays = r300->context.draw_arrays;
-    r300->stencilref_fallback->draw_range_elements = r300->context.draw_range_elements;
-
-    /* Override the draw functions. */
-    r300->context.draw_arrays = r300_stencilref_draw_arrays;
-    r300->context.draw_range_elements = r300_stencilref_draw_range_elements;
+    pipe_surface_reference((struct pipe_surface**)&srcsurf, NULL);
+    pipe_surface_reference((struct pipe_surface**)&aa->dest, NULL);
 }
 
 void r300_init_render_functions(struct r300_context *r300)
@@ -1269,7 +1073,9 @@
         r300->context.draw_range_elements = r300_swtcl_draw_range_elements;
     }
 
-    /* Plug in two-sided stencil reference value fallback if needed. */
+    r300->context.resource_resolve = r300_resource_resolve;
+
+    /* Plug in the two-sided stencil reference value fallback if needed. */
     if (!r300->screen->caps.is_r500)
         r300_plug_in_stencil_ref_fallback(r300);
 }
diff --git a/src/gallium/drivers/r300/r300_render_stencilref.c b/src/gallium/drivers/r300/r300_render_stencilref.c
new file mode 100644
index 0000000..d509ded
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_render_stencilref.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * The two-sided stencil reference value fallback for r3xx-r4xx chips.
+ * These chips support two-sided stencil functions but they do not support
+ * a two-sided reference value.
+ *
+ * The functions below split every draw call which uses the two-sided
+ * reference value into two draw calls -- the first one renders front faces
+ * and the second renders back faces with the other reference value.
+ */
+
+#include "r300_context.h"
+#include "r300_reg.h"
+
+struct r300_stencilref_context { /* Per-context state of the fallback. */
+    void (*draw_arrays)(struct pipe_context *pipe,
+                        unsigned mode, unsigned start, unsigned count);
+    /* Both hooks hold the draw functions the fallback wrappers replaced. */
+    void (*draw_range_elements)(
+        struct pipe_context *pipe, struct pipe_resource *indexBuffer,
+        unsigned indexSize, int indexBias, unsigned minIndex, unsigned maxIndex,
+        unsigned mode, unsigned start, unsigned count);
+    /* Saved in r300_stencilref_begin, restored in r300_stencilref_end. */
+    uint32_t rs_cull_mode;
+    uint32_t zb_stencilrefmask;
+    ubyte ref_value_front;
+};
+
+static boolean r300_stencilref_needed(struct r300_context *r300)
+{   /* Tells the draw wrappers whether a draw must be split in two passes. */
+    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
+    /* Either flagged in the DSA state, or two_sided with differing refs. */
+    return dsa->two_sided_stencil_ref ||
+           (dsa->two_sided &&
+            r300->stencil_ref.ref_value[0] != r300->stencil_ref.ref_value[1]);
+}
+
+/* First pass: save the state the fallback touches and cull back faces. */
+static void r300_stencilref_begin(struct r300_context *r300)
+{
+    struct r300_stencilref_context *sr = r300->stencilref_fallback;
+    struct r300_rs_state *rs = (struct r300_rs_state*)r300->rs_state.state;
+    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
+
+    /* Save state so that r300_stencilref_end can restore it. */
+    sr->rs_cull_mode = rs->cull_mode;
+    sr->zb_stencilrefmask = dsa->stencil_ref_mask;
+    sr->ref_value_front = r300->stencil_ref.ref_value[0];
+
+    /* We *cull* pixels, therefore no need to mask out the bits. */
+    rs->cull_mode |= R300_CULL_BACK;
+
+    r300->rs_state.dirty = TRUE; /* only the rasterizer state changed here */
+}
+
+/* Second pass: cull front faces and use the back-face stencil reference. */
+static void r300_stencilref_switch_side(struct r300_context *r300)
+{
+    struct r300_stencilref_context *sr = r300->stencilref_fallback;
+    struct r300_rs_state *rs = (struct r300_rs_state*)r300->rs_state.state;
+    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
+    /* Rebuild from the saved cull mode, not the one set for the first pass. */
+    rs->cull_mode = sr->rs_cull_mode | R300_CULL_FRONT;
+    dsa->stencil_ref_mask = dsa->stencil_ref_bf;
+    r300->stencil_ref.ref_value[0] = r300->stencil_ref.ref_value[1];
+    /* Both the rasterizer and the DSA state were modified. */
+    r300->rs_state.dirty = TRUE;
+    r300->dsa_state.dirty = TRUE;
+}
+
+/* Undo both passes: restore the state saved by r300_stencilref_begin. */
+static void r300_stencilref_end(struct r300_context *r300)
+{
+    struct r300_stencilref_context *sr = r300->stencilref_fallback;
+    struct r300_rs_state *rs = (struct r300_rs_state*)r300->rs_state.state;
+    struct r300_dsa_state *dsa = (struct r300_dsa_state*)r300->dsa_state.state;
+
+    /* Write back the values captured in r300_stencilref_begin. */
+    rs->cull_mode = sr->rs_cull_mode;
+    dsa->stencil_ref_mask = sr->zb_stencilrefmask;
+    r300->stencil_ref.ref_value[0] = sr->ref_value_front;
+    /* Flag both restored states as dirty. */
+    r300->rs_state.dirty = TRUE;
+    r300->dsa_state.dirty = TRUE;
+}
+
+static void r300_stencilref_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                                        unsigned start, unsigned count)
+{
+    struct r300_context *ctx = r300_context(pipe);
+    struct r300_stencilref_context *fallback = ctx->stencilref_fallback;
+    /* Two passes (front, then back faces) if the split is needed, else one. */
+    if (r300_stencilref_needed(ctx)) {
+        r300_stencilref_begin(ctx);
+        fallback->draw_arrays(pipe, mode, start, count);
+        r300_stencilref_switch_side(ctx);
+        fallback->draw_arrays(pipe, mode, start, count);
+        r300_stencilref_end(ctx);
+    } else {
+        fallback->draw_arrays(pipe, mode, start, count);
+    }
+}
+
+static void r300_stencilref_draw_range_elements(
+    struct pipe_context *pipe, struct pipe_resource *indexBuffer,
+    unsigned indexSize, int indexBias, unsigned minIndex, unsigned maxIndex,
+    unsigned mode, unsigned start, unsigned count)
+{
+    struct r300_context *ctx = r300_context(pipe);
+    struct r300_stencilref_context *fallback = ctx->stencilref_fallback;
+    /* Two passes (front, then back faces) if the split is needed, else one. */
+    if (r300_stencilref_needed(ctx)) {
+        r300_stencilref_begin(ctx);
+        fallback->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                                      minIndex, maxIndex, mode, start, count);
+        r300_stencilref_switch_side(ctx);
+        fallback->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                                      minIndex, maxIndex, mode, start, count);
+        r300_stencilref_end(ctx);
+    } else {
+        fallback->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                                      minIndex, maxIndex, mode, start, count);
+    }
+}
+
+void r300_plug_in_stencil_ref_fallback(struct r300_context *r300)
+{
+    r300->stencilref_fallback = CALLOC_STRUCT(r300_stencilref_context);
+    /* NOTE(review): the result of the allocation above is never checked. */
+    /* Save original draw functions. */
+    r300->stencilref_fallback->draw_arrays = r300->context.draw_arrays;
+    r300->stencilref_fallback->draw_range_elements = r300->context.draw_range_elements;
+
+    /* Route draws through the wrappers, which call the saved functions. */
+    r300->context.draw_arrays = r300_stencilref_draw_arrays;
+    r300->context.draw_range_elements = r300_stencilref_draw_range_elements;
+}
diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c
new file mode 100644
index 0000000..0ea11e5
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_render_translate.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * The functions below translate vertex and index buffers to a layout
+ * compatible with the hardware, so that all vertex and index fetches are
+ * DWORD-aligned and all used vertex and index formats are supported.
+ * For indices, an optional index offset is added to each index.
+ */
+
+#include "r300_context.h"
+#include "translate/translate.h"
+
+void r300_begin_vertex_translate(struct r300_context *r300)
+{   /* Translates vertex elements the HW cannot fetch into a temporary VBO. */
+    struct pipe_context *pipe = &r300->context;
+    struct translate_key key = {0};
+    struct translate_element *te;
+    unsigned tr_elem_index[PIPE_MAX_ATTRIBS] = {0};
+    struct translate *tr;
+    struct r300_vertex_element_state *ve = r300->velems;
+    boolean vb_translated[PIPE_MAX_ATTRIBS] = {0};   /* per vertex buffer */
+    boolean elem_translated[PIPE_MAX_ATTRIBS] = {0}; /* per vertex element */
+    void *vb_map[PIPE_MAX_ATTRIBS] = {0}, *out_map;
+    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0}, *out_transfer;
+    struct pipe_resource *out_buffer;
+    unsigned i, num_verts;
+
+    /* Build the translate key: the recipe for translating the vertices. */
+    for (i = 0; i < ve->count; i++) {
+        struct pipe_vertex_buffer *vb =
+                &r300->vertex_buffer[ve->velem[i].vertex_buffer_index];
+        enum pipe_format output_format = ve->hw_format[i];
+        unsigned output_format_size = ve->hw_format_size[i];
+
+        /* Skip elements that already have a HW format and DWORD alignment. */
+        if (ve->velem[i].src_format == ve->hw_format[i] &&
+            (vb->buffer_offset + ve->velem[i].src_offset) % 4 == 0 &&
+            vb->stride % 4 == 0) {
+            continue;
+        }
+
+        /* Workaround for translate: output floats instead of halfs. */
+        switch (output_format) {
+            case PIPE_FORMAT_R16_FLOAT:
+                output_format = PIPE_FORMAT_R32_FLOAT;
+                output_format_size = 4;
+                break;
+            case PIPE_FORMAT_R16G16_FLOAT:
+                output_format = PIPE_FORMAT_R32G32_FLOAT;
+                output_format_size = 8;
+                break;
+            case PIPE_FORMAT_R16G16B16_FLOAT:
+                output_format = PIPE_FORMAT_R32G32B32_FLOAT;
+                output_format_size = 12;
+                break;
+            case PIPE_FORMAT_R16G16B16A16_FLOAT:
+                output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+                output_format_size = 16;
+                break;
+            default:;
+        }
+
+        /* Add this vertex element. */
+        te = &key.element[key.nr_elements];
+        /* te->type and te->instance_divisor are not set here. */
+        te->input_buffer = ve->velem[i].vertex_buffer_index;
+        te->input_format = ve->velem[i].src_format;
+        te->input_offset = vb->buffer_offset + ve->velem[i].src_offset;
+        te->output_format = output_format;
+        te->output_offset = key.output_stride;
+
+        key.output_stride += output_format_size;
+        vb_translated[ve->velem[i].vertex_buffer_index] = TRUE;
+        elem_translated[i] = TRUE;
+        tr_elem_index[i] = key.nr_elements;
+        key.nr_elements++;
+    }
+
+    /* Get a translate object. */
+    tr = translate_cache_find(r300->tran.translate_cache, &key);
+
+    /* Map buffers we want to translate. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        if (vb_translated[i]) {
+            struct pipe_vertex_buffer *vb = &r300->vertex_buffer[i];
+
+            vb_map[i] = pipe_buffer_map(pipe, vb->buffer,
+                                        PIPE_TRANSFER_READ, &vb_transfer[i]);
+
+            tr->set_buffer(tr, i, vb_map[i], vb->stride, vb->max_index);
+        }
+    }
+
+    /* Create and map the output buffer. */
+    num_verts = r300->vertex_buffer_max_index + 1;
+
+    out_buffer = pipe_buffer_create(&r300->screen->screen,
+                                    PIPE_BIND_VERTEX_BUFFER,
+                                    key.output_stride * num_verts);
+    /* NOTE(review): neither out_buffer nor the mappings are NULL-checked. */
+    out_map = pipe_buffer_map(pipe, out_buffer, PIPE_TRANSFER_WRITE,
+                              &out_transfer);
+
+    /* Translate. */
+    tr->run(tr, 0, num_verts, 0, out_map);
+
+    /* Unmap all buffers. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        if (vb_translated[i]) {
+            pipe_buffer_unmap(pipe, r300->vertex_buffer[i].buffer,
+                              vb_transfer[i]);
+        }
+    }
+
+    pipe_buffer_unmap(pipe, out_buffer, out_transfer);
+
+    /* Set up the new vertex buffer in the first free slot. */
+    for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+        struct pipe_vertex_buffer *vb = &r300->vertex_buffer[i];
+
+        if (!vb->buffer) {
+            pipe_resource_reference(&vb->buffer, out_buffer);
+            vb->buffer_offset = 0;
+            vb->max_index = num_verts - 1;
+            vb->stride = key.output_stride;
+            r300->tran.vb_slot = i;
+            break;
+        }
+    }
+    /* NOTE(review): with no free slot, tran.vb_slot keeps its old value. */
+    /* Save and replace vertex elements. */
+    {
+        struct pipe_vertex_element new_velems[PIPE_MAX_ATTRIBS];
+
+        r300->tran.saved_velems = r300->velems;
+        /* Redirect only the translated elements; the others stay as they are. */
+        for (i = 0; i < ve->count; i++) {
+            if (elem_translated[i]) {
+                te = &key.element[tr_elem_index[i]];
+                new_velems[i].instance_divisor = ve->velem[i].instance_divisor;
+                new_velems[i].src_format = te->output_format;
+                new_velems[i].src_offset = te->output_offset;
+                new_velems[i].vertex_buffer_index = r300->tran.vb_slot;
+            } else {
+                memcpy(&new_velems[i], &ve->velem[i],
+                       sizeof(struct pipe_vertex_element));
+            }
+        }
+
+        r300->tran.new_velems =
+            pipe->create_vertex_elements_state(pipe, ve->count, new_velems);
+        pipe->bind_vertex_elements_state(pipe, r300->tran.new_velems);
+    }
+
+    pipe_resource_reference(&out_buffer, NULL);
+}
+
+void r300_end_vertex_translate(struct r300_context *r300)
+{
+    struct pipe_context *pipe = &r300->context;
+    /* Undo r300_begin_vertex_translate: rebind the saved vertex elements
+     * and delete the temporary ones. */
+    pipe->bind_vertex_elements_state(pipe, r300->tran.saved_velems);
+    pipe->delete_vertex_elements_state(pipe, r300->tran.new_velems);
+
+    /* Release the translated VBO from the slot it was placed in. */
+    pipe_resource_reference(&r300->vertex_buffer[r300->tran.vb_slot].buffer,
+                            NULL);
+}
+
+static void r300_shorten_ubyte_elts(struct r300_context* r300,
+                                    struct pipe_resource** elts,
+                                    int index_bias,
+                                    unsigned start,
+                                    unsigned count)
+{
+    struct pipe_context *pipe = &r300->context;
+    struct pipe_transfer *in_transfer, *out_transfer;
+    struct pipe_resource *widened;
+    unsigned char *src;
+    unsigned short *dst;
+    unsigned n;
+
+    /* Every 8-bit index becomes a 16-bit one: two bytes per output index. */
+    widened = pipe_buffer_create(r300->context.screen,
+                                 PIPE_BIND_INDEX_BUFFER, 2 * count);
+
+    /* Map the source for reading and the new buffer for writing. */
+    src = pipe_buffer_map(pipe, *elts, PIPE_TRANSFER_READ, &in_transfer);
+    dst = pipe_buffer_map(pipe, widened, PIPE_TRANSFER_WRITE, &out_transfer);
+
+    /* Widen the count indices that begin at start, adding index_bias. */
+    src += start;
+    for (n = 0; n < count; n++) {
+        dst[n] = (unsigned short)(src[n] + index_bias);
+    }
+
+    pipe_buffer_unmap(pipe, *elts, in_transfer);
+    pipe_buffer_unmap(pipe, widened, out_transfer);
+
+    /* The caller's pointer is overwritten; the old buffer is not
+     * unreferenced here. */
+    *elts = widened;
+}
+
+static void r300_rebuild_ushort_elts(struct r300_context *r300,
+                                     struct pipe_resource **elts,
+                                     int index_bias,
+                                     unsigned start, unsigned count)
+{
+    struct pipe_context *pipe = &r300->context;
+    struct pipe_transfer *src_transfer = NULL, *dst_transfer = NULL;
+    struct pipe_resource *rebuilt;
+    unsigned short *src;
+    unsigned short *dst;
+    unsigned n;
+
+    /* Two bytes for each of the count 16-bit indices. */
+    rebuilt = pipe_buffer_create(pipe->screen,
+                                 PIPE_BIND_INDEX_BUFFER, 2 * count);
+
+    /* Map the source for reading and the new buffer for writing. */
+    src = pipe_buffer_map(pipe, *elts, PIPE_TRANSFER_READ, &src_transfer);
+    dst = pipe_buffer_map(pipe, rebuilt, PIPE_TRANSFER_WRITE, &dst_transfer);
+
+    /* Copy the count indices that begin at start, adding index_bias to
+     * each one; the sum is converted back to 16 bits. */
+    src += start;
+    for (n = 0; n < count; n++) {
+        dst[n] = (unsigned short)(src[n] + index_bias);
+    }
+
+    pipe_buffer_unmap(pipe, *elts, src_transfer);
+    pipe_buffer_unmap(pipe, rebuilt, dst_transfer);
+
+    /* The caller's pointer is overwritten; the old buffer is not
+     * unreferenced here. */
+    *elts = rebuilt;
+}
+
+static void r300_rebuild_uint_elts(struct r300_context *r300,
+                                   struct pipe_resource **elts,
+                                   int index_bias,
+                                   unsigned start, unsigned count)
+{
+    struct pipe_context *context = &r300->context;
+    struct pipe_transfer *in_transfer = NULL;
+    struct pipe_transfer *out_transfer = NULL;
+    struct pipe_resource *new_elts;
+    unsigned int *in_map;
+    unsigned int *out_map;
+    unsigned i;
+
+    /* One full 32-bit slot per index; 2 * count bytes would be overrun. */
+    new_elts = pipe_buffer_create(context->screen,
+                                  PIPE_BIND_INDEX_BUFFER, 4 * count);
+
+    in_map = pipe_buffer_map(context, *elts,
+                             PIPE_TRANSFER_READ, &in_transfer);
+    out_map = pipe_buffer_map(context, new_elts,
+                              PIPE_TRANSFER_WRITE, &out_transfer);
+    /* Copy the count indices that begin at start, adding index_bias. */
+    in_map += start;
+    for (i = 0; i < count; i++) {
+        *out_map = (unsigned int)(*in_map + index_bias);
+        in_map++;
+        out_map++;
+    }
+
+    pipe_buffer_unmap(context, *elts, in_transfer);
+    pipe_buffer_unmap(context, new_elts, out_transfer);
+
+    *elts = new_elts;
+}
+
+void r300_translate_index_buffer(struct r300_context *r300,
+                                 struct pipe_resource **index_buffer,
+                                 unsigned *index_size, unsigned index_offset,
+                                 unsigned *start, unsigned count)
+{   /* May replace *index_buffer and rewrite *index_size and *start. */
+    switch (*index_size) {
+        case 1: /* 8-bit indices are always widened to 16 bits. */
+            r300_shorten_ubyte_elts(r300, index_buffer, index_offset, *start, count);
+            *index_size = 2;
+            *start = 0;
+            break;
+
+        case 2: /* Rebuilt only for an odd start or a non-zero index offset. */
+            if (*start % 2 != 0 || index_offset) {
+                r300_rebuild_ushort_elts(r300, index_buffer, index_offset, *start, count);
+                *start = 0;
+            }
+            break;
+
+        case 4: /* Rebuilt only to apply a non-zero index offset. */
+            if (index_offset) {
+                r300_rebuild_uint_elts(r300, index_buffer, index_offset, *start, count);
+                *start = 0;
+            }
+            break;
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index f0c562b..d3d36a7 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -269,15 +269,24 @@
                             format == PIPE_FORMAT_R16G16B16_FLOAT ||
                             format == PIPE_FORMAT_R16G16B16A16_FLOAT;
 
-    if (target >= PIPE_MAX_TEXTURE_TYPES) {
-        fprintf(stderr, "r300: Implementation error: Received bogus texture "
-            "target %d in %s\n", target, __FUNCTION__);
-        return FALSE;
+    switch (sample_count) {
+        case 0:
+        case 1:
+            break;
+        case 2:
+        case 3:
+        case 4:
+        case 6:
+            if (usage != PIPE_BIND_RENDER_TARGET ||
+                !util_format_is_rgba8_variant(
+                    util_format_description(format))) {
+                return FALSE;
+            }
+            break;
+        default:
+            return FALSE;
     }
 
-   if (sample_count > 1)
-      return FALSE;
-
     /* Check sampler format support. */
     if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
         /* Z24 cannot be sampled from on non-r5xx. */
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 2949202..29cd5db 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -65,7 +65,7 @@
 /* Logging. */
 #define DBG_FP          (1 << 1)
 #define DBG_VP          (1 << 2)
-#define DBG_CS          (1 << 3)
+/* The bit (1 << 3) is unused. */
 #define DBG_DRAW        (1 << 4)
 #define DBG_TEX         (1 << 5)
 #define DBG_TEXALLOC    (1 << 6)
@@ -76,6 +76,7 @@
 #define DBG_ANISOHQ     (1 << 16)
 #define DBG_NO_TILING   (1 << 17)
 #define DBG_NO_IMMD     (1 << 18)
+#define DBG_FAKE_OCC    (1 << 19)
 /* Statistics. */
 #define DBG_STATS       (1 << 24)
 /*@}*/
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index 44179f1..7959e6a 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -93,25 +93,28 @@
     enum pipe_error ret = PIPE_OK;
     int i, nr;
 
-    nr = r300->vertex_buffer_count;
+    nr = r300->velems->count;
 
     for (i = 0; i < nr; i++) {
-	if (r300_buffer_is_user_buffer(r300->vertex_buffer[i].buffer)) {
-	    struct pipe_resource *upload_buffer = NULL;
-	    unsigned offset = 0; /*r300->vertex_buffer[i].buffer_offset * 4;*/
-	    unsigned size = r300->vertex_buffer[i].buffer->width0;
-	    unsigned upload_offset;
-	    ret = u_upload_buffer(r300->upload_vb,
-				  offset, size,
-				  r300->vertex_buffer[i].buffer,
-				  &upload_offset, &upload_buffer);
-	    if (ret)
-		return ret;
+        struct pipe_vertex_buffer *vb =
+            &r300->vertex_buffer[r300->velems->velem[i].vertex_buffer_index];
 
-	    pipe_resource_reference(&r300->vertex_buffer[i].buffer, NULL);
-	    r300->vertex_buffer[i].buffer = upload_buffer;
-	    r300->vertex_buffer[i].buffer_offset = upload_offset;
-	}
+        if (r300_buffer_is_user_buffer(vb->buffer)) {
+            struct pipe_resource *upload_buffer = NULL;
+            unsigned offset = 0; /*vb->buffer_offset * 4;*/
+            unsigned size = vb->buffer->width0;
+            unsigned upload_offset;
+            ret = u_upload_buffer(r300->upload_vb,
+                                  offset, size,
+                                  vb->buffer,
+                                  &upload_offset, &upload_buffer);
+            if (ret)
+                return ret;
+
+            pipe_resource_reference(&vb->buffer, NULL);
+            vb->buffer = upload_buffer;
+            vb->buffer_offset = upload_offset;
+        }
     }
     return ret;
 }
@@ -261,10 +264,6 @@
     rbuf->b.vtbl = &r300_buffer_vtbl;
     pipe_reference_init(&rbuf->b.b.reference, 1);
     rbuf->b.b.screen = screen;
-
-    if (rbuf->b.b.bind & R300_BIND_OQBO)
-        alignment = 4096;
-
     rbuf->domain = R300_DOMAIN_GTT;
 
     rbuf->buf = r300screen->rws->buffer_create(r300screen->rws,
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h
index 87b42b9..ff35585 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.h
+++ b/src/gallium/drivers/r300/r300_screen_buffer.h
@@ -116,25 +116,4 @@
     return rws->add_buffer(rws, tex->buffer, rd, wr);
 }
 
-static INLINE void r300_buffer_write_reloc(struct r300_winsys_screen *rws,
-				      struct r300_buffer *buf,
-				      enum r300_buffer_domain rd,
-                                      enum r300_buffer_domain wd,
-                                      uint32_t flags)
-{
-    if (!buf->buf)
-	return;
-
-    rws->write_cs_reloc(rws, buf->buf, rd, wd, flags);
-}
-
-static INLINE void r300_texture_write_reloc(struct r300_winsys_screen *rws,
-					    struct r300_texture *texture,
-					    enum r300_buffer_domain rd,
-                                            enum r300_buffer_domain wd,
-                                            uint32_t flags)
-{
-    rws->write_cs_reloc(rws, texture->buffer, rd, wd, flags);
-}
-
 #endif
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index d19563c..9c0f877 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -31,12 +31,12 @@
 
 #include "pipe/p_config.h"
 
+#include "r300_cb.h"
 #include "r300_context.h"
 #include "r300_emit.h"
 #include "r300_reg.h"
 #include "r300_screen.h"
 #include "r300_screen_buffer.h"
-#include "r300_state.h"
 #include "r300_state_inlines.h"
 #include "r300_fs.h"
 #include "r300_texture.h"
@@ -183,6 +183,12 @@
 {
     struct r300_screen* r300screen = r300_screen(pipe->screen);
     struct r300_blend_state* blend = CALLOC_STRUCT(r300_blend_state);
+    uint32_t blend_control = 0;       /* R300_RB3D_CBLEND: 0x4e04 */
+    uint32_t alpha_blend_control = 0; /* R300_RB3D_ABLEND: 0x4e08 */
+    uint32_t color_channel_mask = 0;  /* R300_RB3D_COLOR_CHANNEL_MASK: 0x4e0c */
+    uint32_t rop = 0;                 /* R300_RB3D_ROPCNTL: 0x4e18 */
+    uint32_t dither = 0;              /* R300_RB3D_DITHER_CTL: 0x4e50 */
+    CB_LOCALS;
 
     if (state->rt[0].blend_enable)
     {
@@ -196,7 +202,7 @@
 
         /* despite the name, ALPHA_BLEND_ENABLE has nothing to do with alpha,
          * this is just the crappy D3D naming */
-        blend->blend_control = R300_ALPHA_BLEND_ENABLE |
+        blend_control = R300_ALPHA_BLEND_ENABLE |
             r300_translate_blend_function(eqRGB) |
             ( r300_translate_blend_factor(srcRGB) << R300_SRC_BLEND_SHIFT) |
             ( r300_translate_blend_factor(dstRGB) << R300_DST_BLEND_SHIFT);
@@ -220,7 +226,7 @@
             srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
             srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) {
             /* Enable reading from the colorbuffer. */
-            blend->blend_control |= R300_READ_ENABLE;
+            blend_control |= R300_READ_ENABLE;
 
             if (r300screen->caps.is_r500) {
                 /* Optimization: Depending on incoming pixels, we can
@@ -233,7 +239,7 @@
                         (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
                          dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
                          dstA == PIPE_BLENDFACTOR_ZERO)) {
-                         blend->blend_control |= R500_SRC_ALPHA_0_NO_READ;
+                         blend_control |= R500_SRC_ALPHA_0_NO_READ;
                     }
 
                     /* Disable reading if SRC_ALPHA == 1. */
@@ -242,7 +248,7 @@
                         (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
                          dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
                          dstA == PIPE_BLENDFACTOR_ZERO)) {
-                         blend->blend_control |= R500_SRC_ALPHA_1_NO_READ;
+                         blend_control |= R500_SRC_ALPHA_1_NO_READ;
                     }
                 }
             }
@@ -272,31 +278,31 @@
              * pixels.
              */
             if (blend_discard_if_src_alpha_0(srcRGB, srcA, dstRGB, dstA)) {
-                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
             } else if (blend_discard_if_src_alpha_1(srcRGB, srcA,
                                                     dstRGB, dstA)) {
-                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1;
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1;
             } else if (blend_discard_if_src_color_0(srcRGB, srcA,
                                                     dstRGB, dstA)) {
-                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_0;
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_0;
             } else if (blend_discard_if_src_color_1(srcRGB, srcA,
                                                     dstRGB, dstA)) {
-                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_1;
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_1;
             } else if (blend_discard_if_src_alpha_color_0(srcRGB, srcA,
                                                           dstRGB, dstA)) {
-                blend->blend_control |=
+                blend_control |=
                     R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0;
             } else if (blend_discard_if_src_alpha_color_1(srcRGB, srcA,
                                                           dstRGB, dstA)) {
-                blend->blend_control |=
+                blend_control |=
                     R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1;
             }
         }
 
         /* separate alpha */
         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
-            blend->blend_control |= R300_SEPARATE_ALPHA_ENABLE;
-            blend->alpha_blend_control =
+            blend_control |= R300_SEPARATE_ALPHA_ENABLE;
+            alpha_blend_control =
                 r300_translate_blend_function(eqA) |
                 (r300_translate_blend_factor(srcA) << R300_SRC_BLEND_SHIFT) |
                 (r300_translate_blend_factor(dstA) << R300_DST_BLEND_SHIFT);
@@ -305,21 +311,21 @@
 
     /* PIPE_LOGICOP_* don't need to be translated, fortunately. */
     if (state->logicop_enable) {
-        blend->rop = R300_RB3D_ROPCNTL_ROP_ENABLE |
+        rop = R300_RB3D_ROPCNTL_ROP_ENABLE |
                 (state->logicop_func) << R300_RB3D_ROPCNTL_ROP_SHIFT;
     }
 
     /* Color channel masks for all MRTs. */
-    blend->color_channel_mask = bgra_cmask(state->rt[0].colormask);
+    color_channel_mask = bgra_cmask(state->rt[0].colormask);
     if (r300screen->caps.is_r500 && state->independent_blend_enable) {
         if (state->rt[1].blend_enable) {
-            blend->color_channel_mask |= bgra_cmask(state->rt[1].colormask) << 4;
+            color_channel_mask |= bgra_cmask(state->rt[1].colormask) << 4;
         }
         if (state->rt[2].blend_enable) {
-            blend->color_channel_mask |= bgra_cmask(state->rt[2].colormask) << 8;
+            color_channel_mask |= bgra_cmask(state->rt[2].colormask) << 8;
         }
         if (state->rt[3].blend_enable) {
-            blend->color_channel_mask |= bgra_cmask(state->rt[3].colormask) << 12;
+            color_channel_mask |= bgra_cmask(state->rt[3].colormask) << 12;
         }
     }
 
@@ -330,11 +336,31 @@
      * This could be revisited if we ever get quality or conformance hints.
      *
     if (state->dither) {
-        blend->dither = R300_RB3D_DITHER_CTL_DITHER_MODE_LUT |
+        dither = R300_RB3D_DITHER_CTL_DITHER_MODE_LUT |
                         R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT;
     }
     */
 
+    /* Build a command buffer. */
+    BEGIN_CB(blend->cb, 8);
+    OUT_CB_REG(R300_RB3D_ROPCNTL, rop);
+    OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
+    OUT_CB(blend_control);
+    OUT_CB(alpha_blend_control);
+    OUT_CB(color_channel_mask);
+    OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
+    END_CB;
+
+    /* The same as above, but with no colorbuffer reads and writes. */
+    BEGIN_CB(blend->cb_no_readwrite, 8);
+    OUT_CB_REG(R300_RB3D_ROPCNTL, rop);
+    OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
+    END_CB;
+
     return (void*)blend;
 }
 
@@ -368,20 +394,26 @@
     struct r300_context* r300 = r300_context(pipe);
     struct r300_blend_color_state* state =
         (struct r300_blend_color_state*)r300->blend_color_state.state;
-    union util_color uc;
+    CB_LOCALS;
 
-    util_pack_color(color->color, PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
-    state->blend_color = uc.ui;
+    if (r300->screen->caps.is_r500) {
+        /* XXX if FP16 blending is enabled, we should use the FP16 format */
+        BEGIN_CB(state->cb, 3);
+        OUT_CB_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2);
+        OUT_CB(float_to_fixed10(color->color[0]) |
+               (float_to_fixed10(color->color[3]) << 16));
+        OUT_CB(float_to_fixed10(color->color[2]) |
+               (float_to_fixed10(color->color[1]) << 16));
+        END_CB;
+    } else {
+        union util_color uc;
+        util_pack_color(color->color, PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
 
-    /* XXX if FP16 blending is enabled, we should use the FP16 format */
-    state->blend_color_red_alpha =
-        float_to_fixed10(color->color[0]) |
-        (float_to_fixed10(color->color[3]) << 16);
-    state->blend_color_green_blue =
-        float_to_fixed10(color->color[2]) |
-        (float_to_fixed10(color->color[1]) << 16);
+        BEGIN_CB(state->cb, 2);
+        OUT_CB_REG(R300_RB3D_BLEND_COLOR, uc.ui);
+        END_CB;
+    }
 
-    r300->blend_color_state.size = r300->screen->caps.is_r500 ? 3 : 2;
     r300->blend_color_state.dirty = TRUE;
 }
 
@@ -389,18 +421,27 @@
                                 const struct pipe_clip_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_clip_state *clip =
+            (struct r300_clip_state*)r300->clip_state.state;
+    CB_LOCALS;
 
-    r300->clip = *state;
+    clip->clip = *state;
 
     if (r300->screen->caps.has_tcl) {
-        memcpy(r300->clip_state.state, state, sizeof(struct pipe_clip_state));
-        r300->clip_state.size = 29;
+        BEGIN_CB(clip->cb, 29);
+        OUT_CB_REG(R300_VAP_PVS_VECTOR_INDX_REG,
+                (r300->screen->caps.is_r500 ?
+                 R500_PVS_UCP_START : R300_PVS_UCP_START));
+        OUT_CB_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, 6 * 4);
+        OUT_CB_TABLE(state->ucp, 6 * 4);
+        OUT_CB_REG(R300_VAP_CLIP_CNTL, ((1 << state->nr) - 1) |
+                R300_PS_UCP_MODE_CLIP_AS_TRIFAN);
+        END_CB;
 
         r300->clip_state.dirty = TRUE;
     } else {
         draw_flush(r300->draw);
         draw_set_clip_state(r300->draw, state);
-        r300->clip_state.size = 2;
     }
 }
 
@@ -422,6 +463,9 @@
 {
     struct r300_capabilities *caps = &r300_screen(pipe->screen)->caps;
     struct r300_dsa_state* dsa = CALLOC_STRUCT(r300_dsa_state);
+    CB_LOCALS;
+
+    dsa->dsa = *state;
 
     /* Depth test setup. */
     if (state->depth.enabled) {
@@ -494,9 +538,43 @@
             dsa->alpha_function |= R500_FG_ALPHA_FUNC_8BIT;
     }
 
+    BEGIN_CB(&dsa->cb_begin, 8);
+    OUT_CB_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
+    OUT_CB_REG_SEQ(R300_ZB_CNTL, 3);
+    OUT_CB(dsa->z_buffer_control);
+    OUT_CB(dsa->z_stencil_control);
+    OUT_CB(dsa->stencil_ref_mask);
+    OUT_CB_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf);
+    END_CB;
+
+    BEGIN_CB(dsa->cb_no_readwrite, 8);
+    OUT_CB_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
+    OUT_CB_REG_SEQ(R300_ZB_CNTL, 3);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB_REG(R500_ZB_STENCILREFMASK_BF, 0);
+    END_CB;
+
     return (void*)dsa;
 }
 
+static void r300_dsa_inject_stencilref(struct r300_context *r300)
+{
+    struct r300_dsa_state *dsa =
+            (struct r300_dsa_state*)r300->dsa_state.state;
+
+    if (!dsa)
+        return;
+
+    dsa->stencil_ref_mask =
+        (dsa->stencil_ref_mask & ~R300_STENCILREF_MASK) |
+        r300->stencil_ref.ref_value[0];
+    dsa->stencil_ref_bf =
+        (dsa->stencil_ref_bf & ~R300_STENCILREF_MASK) |
+        r300->stencil_ref.ref_value[1];
+}
+
 /* Bind DSA state. */
 static void r300_bind_dsa_state(struct pipe_context* pipe,
                                 void* state)
@@ -508,6 +586,8 @@
     }
 
     UPDATE_STATE(state, r300->dsa_state);
+
+    r300_dsa_inject_stencilref(r300);
 }
 
 /* Free DSA state. */
@@ -523,6 +603,8 @@
     struct r300_context* r300 = r300_context(pipe);
 
     r300->stencil_ref = *sr;
+
+    r300_dsa_inject_stencilref(r300);
     r300->dsa_state.dirty = TRUE;
 }
 
@@ -582,16 +664,11 @@
                                const struct pipe_framebuffer_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_aa_state *aa = (struct r300_aa_state*)r300->aa_state.state;
     struct pipe_framebuffer_state *old_state = r300->fb_state.state;
     unsigned max_width, max_height, i;
     uint32_t zbuffer_bpp = 0;
 
-    if (state->nr_cbufs > 4) {
-        fprintf(stderr, "r300: Implementation error: Too many MRTs in %s, "
-            "refusing to bind framebuffer state!\n", __FUNCTION__);
-        return;
-    }
-
     if (r300->screen->caps.is_r500) {
         max_width = max_height = 4096;
     } else if (r300->screen->caps.is_r400) {
@@ -610,6 +687,8 @@
         draw_flush(r300->draw);
     }
 
+    r300->gpu_flush.dirty = TRUE;
+    r300->aa_state.dirty = TRUE;
     r300->fb_state.dirty = TRUE;
 
     /* If nr_cbufs is changed from zero to non-zero or vice versa... */
@@ -626,8 +705,10 @@
 
     memcpy(r300->fb_state.state, state, sizeof(struct pipe_framebuffer_state));
 
-    r300->fb_state.size = (10 * state->nr_cbufs) + (2 * (4 - state->nr_cbufs)) +
-                          (state->zsbuf ? 10 : 0) + 9;
+    r300->fb_state.size =
+            7 +
+            (8 * state->nr_cbufs) +
+            (state->zsbuf ? (r300->screen->caps.has_hiz ? 22 : 18) : 0);
 
     /* Polygon offset depends on the zbuffer bit depth. */
     if (state->zsbuf && r300->polygon_offset_enabled) {
@@ -646,6 +727,30 @@
         }
     }
 
+    /* Set up AA config. */
+    if (r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0)) {
+        if (state->nr_cbufs && state->cbufs[0]->texture->nr_samples > 1) {
+            aa->aa_config = R300_GB_AA_CONFIG_AA_ENABLE;
+
+            switch (state->cbufs[0]->texture->nr_samples) {
+                case 2:
+                    aa->aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2;
+                    break;
+                case 3:
+                    aa->aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3;
+                    break;
+                case 4:
+                    aa->aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4;
+                    break;
+                case 6:
+                    aa->aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6;
+                    break;
+            }
+        } else {
+            aa->aa_config = 0;
+        }
+    }
+
     if (DBG_ON(r300, DBG_FB)) {
         fprintf(stderr, "r300: set_framebuffer_state:\n");
         for (i = 0; i < state->nr_cbufs; i++) {
@@ -679,13 +784,12 @@
     r300->fs.dirty = TRUE;
     r300->fs_rc_constant_state.dirty = TRUE;
     r300->fs_constants.dirty = TRUE;
+    r300->fs.size = fs->shader->cb_code_size;
 
     if (r300->screen->caps.is_r500) {
-        r300->fs.size = r500_get_fs_atom_size(r300);
         r300->fs_rc_constant_state.size = fs->shader->rc_state_count * 7;
         r300->fs_constants.size = fs->shader->externals_count * 4 + 3;
     } else {
-        r300->fs.size = r300_get_fs_atom_size(r300);
         r300->fs_rc_constant_state.size = fs->shader->rc_state_count * 5;
         r300->fs_constants.size = fs->shader->externals_count * 4 + 1;
     }
@@ -719,6 +823,7 @@
         tmp = ptr;
         ptr = ptr->next;
         rc_constants_destroy(&tmp->code.constants);
+        FREE(tmp->cb_code);
         FREE(tmp);
     }
     FREE((void*)fs->state.tokens);
@@ -870,6 +975,11 @@
         }
     }
 
+    if (state->gl_rasterization_rules) {
+        rs->multisample_position_0 = 0x66666666;
+        rs->multisample_position_1 = 0x6666666;
+    }
+
     return (void*)rs;
 }
 
@@ -899,7 +1009,8 @@
     }
 
     UPDATE_STATE(state, r300->rs_state);
-    r300->rs_state.size = 27 + (r300->polygon_offset_enabled ? 5 : 0);
+    r300->rs_state.size = 25 + (r300->polygon_offset_enabled ? 5 : 0) +
+        (r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0) ? 3 : 0);
 
     if (last_sprite_coord_enable != r300->sprite_coord_enable ||
         last_two_sided_color != r300->two_sided_color) {
@@ -925,10 +1036,34 @@
 
     sampler->state = *state;
 
+    /* r300 doesn't handle CLAMP and MIRROR_CLAMP correctly when either MAG
+     * or MIN filter is NEAREST. Since texwrap produces the same results
+     * for CLAMP and CLAMP_TO_EDGE, we use the *_TO_EDGE modes instead. */
+    if (sampler->state.min_img_filter == PIPE_TEX_FILTER_NEAREST ||
+        sampler->state.mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
+        /* Wrap S. */
+        if (sampler->state.wrap_s == PIPE_TEX_WRAP_CLAMP)
+            sampler->state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        else if (sampler->state.wrap_s == PIPE_TEX_WRAP_MIRROR_CLAMP)
+            sampler->state.wrap_s = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
+
+        /* Wrap T. */
+        if (sampler->state.wrap_t == PIPE_TEX_WRAP_CLAMP)
+            sampler->state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        else if (sampler->state.wrap_t == PIPE_TEX_WRAP_MIRROR_CLAMP)
+            sampler->state.wrap_t = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
+
+        /* Wrap R. */
+        if (sampler->state.wrap_r == PIPE_TEX_WRAP_CLAMP)
+            sampler->state.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        else if (sampler->state.wrap_r == PIPE_TEX_WRAP_MIRROR_CLAMP)
+            sampler->state.wrap_r = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
+    }
+
     sampler->filter0 |=
-        (r300_translate_wrap(state->wrap_s) << R300_TX_WRAP_S_SHIFT) |
-        (r300_translate_wrap(state->wrap_t) << R300_TX_WRAP_T_SHIFT) |
-        (r300_translate_wrap(state->wrap_r) << R300_TX_WRAP_R_SHIFT);
+        (r300_translate_wrap(sampler->state.wrap_s) << R300_TX_WRAP_S_SHIFT) |
+        (r300_translate_wrap(sampler->state.wrap_t) << R300_TX_WRAP_T_SHIFT) |
+        (r300_translate_wrap(sampler->state.wrap_r) << R300_TX_WRAP_R_SHIFT);
 
     sampler->filter0 |= r300_translate_tex_filters(state->min_img_filter,
                                                    state->mag_img_filter,
@@ -995,6 +1130,31 @@
     FREE(state);
 }
 
+static uint32_t r300_assign_texture_cache_region(unsigned index, unsigned num)
+{
+    /* This looks like a hack, but I believe it's supposed to work like
+     * that. To illustrate how this works, let's assume you have 5 textures.
+     * From docs, 5 and the successive numbers are:
+     *
+     * FOURTH_1     = 5
+     * FOURTH_2     = 6
+     * FOURTH_3     = 7
+     * EIGHTH_0     = 8
+     * EIGHTH_1     = 9
+     *
+     * The first 3 textures will get 3/4 of the cache, divided evenly
+     * between them. The last 1/4 of the cache must be divided between
+     * the last 2 textures; each will therefore get 1/8 of the cache.
+     * Why not just use "5 + texture_index"?
+     *
+     * This simple trick works for all "num" <= 16.
+     */
+    if (num <= 1)
+        return R300_TX_CACHE(R300_TX_CACHE_WHOLE);
+    else
+        return R300_TX_CACHE(num + index);
+}
+
 static void r300_set_fragment_sampler_views(struct pipe_context* pipe,
                                             unsigned count,
                                             struct pipe_sampler_view** views)
@@ -1003,7 +1163,7 @@
     struct r300_textures_state* state =
         (struct r300_textures_state*)r300->textures_state.state;
     struct r300_texture *texture;
-    unsigned i;
+    unsigned i, real_num_views = 0, view_index = 0;
     unsigned tex_units = r300->screen->caps.num_tex_units;
     boolean dirty_tex = FALSE;
 
@@ -1011,6 +1171,12 @@
         return;
     }
 
+    /* Calculate the real number of views. */
+    for (i = 0; i < count; i++) {
+        if (views[i])
+            real_num_views++;
+    }
+
     for (i = 0; i < count; i++) {
         if (&state->sampler_views[i]->base != views[i]) {
             pipe_sampler_view_reference(
@@ -1030,6 +1196,10 @@
             if (texture->uses_pitch) {
                 r300->fs_rc_constant_state.dirty = TRUE;
             }
+
+            state->sampler_views[i]->texcache_region =
+                r300_assign_texture_cache_region(view_index, real_num_views);
+            view_index++;
         }
     }
 
@@ -1296,7 +1466,6 @@
     if (velems != NULL) {
         velems->count = count;
         memcpy(velems->velem, attribs, sizeof(struct pipe_vertex_element) * count);
-        velems->incompatible_layout = FALSE;
 
         if (r300_screen(pipe->screen)->caps.has_tcl) {
             /* Set the best hw format in case the original format is not
@@ -1354,11 +1523,13 @@
 
             /* Align the formats to the size of DWORD.
              * We only care about the blocksizes of the formats since
-             * swizzles are already set up. */
+             * swizzles are already set up.
+             * Also compute the vertex size. */
             for (i = 0; i < count; i++) {
                 /* This is OK because we check for aligned strides too. */
                 velems->hw_format_size[i] =
                     align(util_format_get_blocksize(velems->hw_format[i]), 4);
+                velems->vertex_size_dwords += velems->hw_format_size[i] / 4;
             }
         }
     }
@@ -1433,7 +1604,7 @@
     if (r300->screen->caps.has_tcl) {
         r300->vs_state.dirty = TRUE;
         r300->vs_state.size =
-                vs->code.length + 9 +
+                vs->code.length + 18 +
                 (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0);
 
         if (vs->externals_count) {
@@ -1474,7 +1645,7 @@
     struct r300_context* r300 = r300_context(pipe);
     struct r300_constant_buffer *cbuf;
     struct pipe_transfer *tr;
-    void *mapped;
+    float *mapped;
     int max_size = 0, max_size_bytes = 0, clamped_size = 0;
 
     switch (shader) {
@@ -1513,10 +1684,20 @@
             fprintf(stderr, "r300: Max size of the constant buffer is "
                           "%i*4 floats.\n", max_size);
         }
-        clamped_size = MIN2(buf->width0, max_size_bytes);
 
-        memcpy(cbuf->constants, mapped, clamped_size);
+        clamped_size = MIN2(buf->width0, max_size_bytes);
         cbuf->count = clamped_size / (4 * sizeof(float));
+
+        if (shader == PIPE_SHADER_FRAGMENT && !r300->screen->caps.is_r500) {
+            unsigned i,j;
+
+            /* Convert constants to float24. */
+            for (i = 0; i < cbuf->count; i++)
+                for (j = 0; j < 4; j++)
+                    cbuf->constants[i][j] = pack_float24(mapped[i*4+j]);
+        } else {
+            memcpy(cbuf->constants, mapped, clamped_size);
+        }
     }
 
     if (shader == PIPE_SHADER_VERTEX) {
diff --git a/src/gallium/drivers/r300/r300_state.h b/src/gallium/drivers/r300/r300_state.h
deleted file mode 100644
index 1d55750..0000000
--- a/src/gallium/drivers/r300/r300_state.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2008 Marek Olšák <maraeo@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#ifndef R300_STATE_H
-#define R300_STATE_H
-
-struct r300_context;
-
-void r300_mark_fs_code_dirty(struct r300_context *r300);
-
-#endif /* R300_STATE_H */
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index cc75fad..3aa8deb 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -31,7 +31,6 @@
 #include "r300_hyperz.h"
 #include "r300_screen.h"
 #include "r300_shader_semantics.h"
-#include "r300_state.h"
 #include "r300_state_derived.h"
 #include "r300_state_inlines.h"
 #include "r300_texture.h"
@@ -537,6 +536,10 @@
         UTIL_FORMAT_SWIZZLE_X
     };
 
+    /* The KIL opcode fix, see below. */
+    if (!count && !r300->screen->caps.is_r500)
+        count = 1;
+
     state->tx_enable = 0;
     state->count = 0;
     size = 2;
@@ -555,6 +558,9 @@
             texstate->filter1 = sampler->filter1;
             texstate->border_color = sampler->border_color;
 
+            /* Assign a texture cache region. */
+            texstate->format.format1 |= view->texcache_region;
+
             /* If compare mode is disabled, the sampler view swizzles
              * are stored in the format.
              * Otherwise, swizzles must be applied after the compare mode
@@ -613,6 +619,36 @@
 
             size += 16;
             state->count = i+1;
+        } else {
+            /* For the KIL opcode to work on r3xx-r4xx, the texture unit
+             * assigned to this opcode (it's always the first one) must be
+             * enabled. Otherwise the opcode doesn't work.
+             *
+             * In order to not depend on the fragment shader, we just make
+             * the first unit enabled all the time. */
+            if (i == 0 && !r300->screen->caps.is_r500) {
+                pipe_sampler_view_reference(
+                        (struct pipe_sampler_view**)&state->sampler_views[i],
+                        &r300->texkill_sampler->base);
+
+                state->tx_enable |= 1 << i;
+
+                texstate = &state->regs[i];
+
+                /* Just set some valid state. */
+                texstate->format = r300->texkill_sampler->format;
+                texstate->filter0 =
+                        r300_translate_tex_filters(PIPE_TEX_FILTER_NEAREST,
+                                                   PIPE_TEX_FILTER_NEAREST,
+                                                   PIPE_TEX_FILTER_NEAREST,
+                                                   FALSE);
+                texstate->filter1 = 0;
+                texstate->border_color = 0;
+
+                texstate->filter0 |= i << 28;
+                size += 16;
+                state->count = i+1;
+            }
         }
     }
 
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 34d3a16..acd2097 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -38,79 +38,22 @@
 {
     CS_LOCALS(r300);
 
-    if (r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0)) {
-        /* Subpixel multisampling for AA. */
-        BEGIN_CS(4);
-        OUT_CS_REG(R300_GB_MSPOS0, 0x66666666);
-        OUT_CS_REG(R300_GB_MSPOS1, 0x6666666);
-        END_CS;
-    }
+    BEGIN_CS(18 + (r300->screen->caps.is_rv350 ? 4 : 0));
 
-    BEGIN_CS(12 + (r300->screen->caps.has_tcl ? 2 : 0));
-
-    /*** Graphics Backend (GB) ***/
-    /* Source of fog depth */
-    OUT_CS_REG(R300_GB_SELECT, R300_GB_FOG_SELECT_1_1_W);
-
-    /*** Fog (FG) ***/
-    OUT_CS_REG(R300_FG_FOG_BLEND, 0x0);
-    OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x0);
-    OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x0);
-    OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x0);
-
-    /*** VAP ***/
-    /* Sign/normalize control */
-    OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, R300_SGN_NORM_NO_ZERO);
-    /* TCL-only stuff */
-    if (r300->screen->caps.has_tcl) {
-        /* Amount of time to wait for vertex fetches in PVS */
-        OUT_CS_REG(VAP_PVS_VTX_TIMEOUT_REG, 0xffff);
-    }
-
-    END_CS;
-
-    /* XXX unsorted stuff from surface_fill */
-    BEGIN_CS(38 + (r300->screen->caps.has_tcl ? 7 : 0) +
-             (r300->screen->caps.is_rv350 ? 4 : 0));
-
-    if (r300->screen->caps.has_tcl) {
-        /*Flushing PVS is required before the VAP_GB registers can be changed*/
-        OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-        OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4);
-        OUT_CS_32F(1.0);
-        OUT_CS_32F(1.0);
-        OUT_CS_32F(1.0);
-        OUT_CS_32F(1.0);
-    }
-    /* XXX line tex stuffing */
-    OUT_CS_REG_SEQ(R300_GA_LINE_S0, 1);
-    OUT_CS_32F(0.0);
-    OUT_CS_REG_SEQ(R300_GA_LINE_S1, 1);
-    OUT_CS_32F(1.0);
-    OUT_CS_REG(R300_GA_TRIANGLE_STIPPLE, 0x5 |
-        (0x5 << R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT));
-    /* XXX this big chunk should be refactored into rs_state */
-    OUT_CS_REG(R300_GA_SOLID_RG, 0x00000000);
-    OUT_CS_REG(R300_GA_SOLID_BA, 0x00000000);
-    OUT_CS_REG(R300_GA_ROUND_MODE, 0x00000001);
-    OUT_CS_REG(R300_GA_OFFSET, 0x00000000);
-    OUT_CS_REG(R300_GA_FOG_SCALE, 0x3DBF1412);
-    OUT_CS_REG(R300_GA_FOG_OFFSET, 0x00000000);
-    OUT_CS_REG(R300_SU_TEX_WRAP, 0x00000000);
+    OUT_CS_REG(R300_GB_SELECT, 0);
+    OUT_CS_REG(R300_FG_FOG_BLEND, 0);
+    OUT_CS_REG(R300_GA_ROUND_MODE, 1);
+    OUT_CS_REG(R300_GA_OFFSET, 0);
+    OUT_CS_REG(R300_SU_TEX_WRAP, 0);
     OUT_CS_REG(R300_SU_DEPTH_SCALE, 0x4B7FFFFF);
-    OUT_CS_REG(R300_SU_DEPTH_OFFSET, 0x00000000);
-    OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C);
+    OUT_CS_REG(R300_SU_DEPTH_OFFSET, 0);
+    OUT_CS_REG(R300_SC_HYPERZ, 0x1C);
     OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
-    OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
 
     if (r300->screen->caps.is_rv350) {
         OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
         OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFEFEFEFE);
     }
 
-    OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000);
-    OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
-    OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000);
-    OUT_CS_REG(R300_ZB_HIZ_PITCH, 0x00000000);
     END_CS;
 }
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 4ca8ce0..ddb6600 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -21,30 +21,26 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "pipe/p_screen.h"
+/* Always include headers in the reverse order!! ~ M. */
+#include "r300_texture.h"
+
+#include "r300_context.h"
+#include "r300_reg.h"
+#include "r300_transfer.h"
+#include "r300_screen.h"
+#include "r300_winsys.h"
 
 #include "util/u_format.h"
 #include "util/u_format_s3tc.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
-#include "r300_context.h"
-#include "r300_reg.h"
-#include "r300_texture.h"
-#include "r300_transfer.h"
-#include "r300_screen.h"
-#include "r300_winsys.h"
+#include "pipe/p_screen.h"
+#include "state_tracker/drm_api.h"
 
-#define TILE_WIDTH 0
-#define TILE_HEIGHT 1
-
-static const unsigned microblock_table[5][3][2] = {
-    /*linear  tiled   square-tiled */
-    {{32, 1}, {8, 4}, {0, 0}}, /*   8 bits per pixel */
-    {{16, 1}, {8, 2}, {4, 4}}, /*  16 bits per pixel */
-    {{ 8, 1}, {4, 2}, {0, 0}}, /*  32 bits per pixel */
-    {{ 4, 1}, {0, 0}, {2, 2}}, /*  64 bits per pixel */
-    {{ 2, 1}, {0, 0}, {0, 0}}  /* 128 bits per pixel */
+enum r300_dim {
+    DIM_WIDTH  = 0,
+    DIM_HEIGHT = 1
 };
 
 unsigned r300_get_swizzle_combined(const unsigned char *swizzle_format,
@@ -596,21 +592,21 @@
     /* Set framebuffer state. */
     if (util_format_is_depth_or_stencil(tex->b.b.format)) {
         for (i = 0; i <= tex->b.b.last_level; i++) {
-            tex->fb_state.depthpitch[i] =
+            tex->fb_state.pitch[i] =
                 tex->hwpitch[i] |
                 R300_DEPTHMACROTILE(tex->mip_macrotile[i]) |
                 R300_DEPTHMICROTILE(tex->microtile);
         }
-        tex->fb_state.zb_format = r300_translate_zsformat(tex->b.b.format);
+        tex->fb_state.format = r300_translate_zsformat(tex->b.b.format);
     } else {
         for (i = 0; i <= tex->b.b.last_level; i++) {
-            tex->fb_state.colorpitch[i] =
+            tex->fb_state.pitch[i] =
                 tex->hwpitch[i] |
                 r300_translate_colorformat(tex->b.b.format) |
                 R300_COLOR_TILE(tex->mip_macrotile[i]) |
                 R300_COLOR_MICROTILE(tex->microtile);
         }
-        tex->fb_state.us_out_fmt = r300_translate_out_fmt(tex->b.b.format);
+        tex->fb_state.format = r300_translate_out_fmt(tex->b.b.format);
     }
 }
 
@@ -620,8 +616,10 @@
 {
     struct r300_screen *r300screen = r300_screen(screen);
 
-    SCREEN_DBG(r300screen, DBG_TEX, "r300: texture_reinterpret_format: %s -> %s\n",
-               util_format_short_name(tex->format), util_format_short_name(new_format));
+    SCREEN_DBG(r300screen, DBG_TEX,
+        "r300: texture_reinterpret_format: %s -> %s\n",
+        util_format_short_name(tex->format),
+        util_format_short_name(new_format));
 
     tex->format = new_format;
 
@@ -648,36 +646,65 @@
     }
 }
 
-/**
- * Return the width (dim==TILE_WIDTH) or height (dim==TILE_HEIGHT) of one tile
- * of the given texture.
- */
-static unsigned r300_texture_get_tile_size(struct r300_texture* tex,
-                                           int dim, boolean macrotile)
+/* Returns the number of pixels that the texture should be aligned to
+ * in the given dimension. */
+static unsigned r300_get_pixel_alignment(struct r300_texture *tex,
+                                         enum r300_buffer_tiling macrotile,
+                                         enum r300_dim dim)
 {
-    unsigned pixsize, tile_size;
+    static const unsigned table[2][5][3][2] =
+    {
+        {
+    /* Macro: linear    linear    linear
+       Micro: linear    tiled  square-tiled */
+            {{ 32, 1}, { 8,  4}, { 0,  0}}, /*   8 bits per pixel */
+            {{ 16, 1}, { 8,  2}, { 4,  4}}, /*  16 bits per pixel */
+            {{  8, 1}, { 4,  2}, { 0,  0}}, /*  32 bits per pixel */
+            {{  4, 1}, { 0,  0}, { 2,  2}}, /*  64 bits per pixel */
+            {{  2, 1}, { 0,  0}, { 0,  0}}  /* 128 bits per pixel */
+        },
+        {
+    /* Macro: tiled     tiled     tiled
+       Micro: linear    tiled  square-tiled */
+            {{256, 8}, {64, 32}, { 0,  0}}, /*   8 bits per pixel */
+            {{128, 8}, {64, 16}, {32, 32}}, /*  16 bits per pixel */
+            {{ 64, 8}, {32, 16}, { 0,  0}}, /*  32 bits per pixel */
+            {{ 32, 8}, { 0,  0}, {16, 16}}, /*  64 bits per pixel */
+            {{ 16, 8}, { 0,  0}, { 0,  0}}  /* 128 bits per pixel */
+        }
+    };
+    static const unsigned aa_block[2] = {4, 8};
+    unsigned res = 0;
+    unsigned pixsize = util_format_get_blocksize(tex->b.b.format);
 
-    pixsize = util_format_get_blocksize(tex->b.b.format);
-    tile_size = microblock_table[util_logbase2(pixsize)][tex->microtile][dim];
+    assert(macrotile <= R300_BUFFER_TILED);
+    assert(tex->microtile <= R300_BUFFER_SQUARETILED);
+    assert(pixsize <= 16);
+    assert(dim <= DIM_HEIGHT);
 
-    if (macrotile) {
-        tile_size *= 8;
+    if (tex->b.b.nr_samples > 1) {
+        /* Multisampled textures have their own alignment scheme. */
+        if (pixsize == 4)
+            res = aa_block[dim];
+    } else {
+        /* Standard alignment. */
+        res = table[macrotile][util_logbase2(pixsize)][tex->microtile][dim];
     }
 
-    assert(tile_size);
-    return tile_size;
+    assert(res);
+    return res;
 }
 
 /* Return true if macrotiling should be enabled on the miplevel. */
 static boolean r300_texture_macro_switch(struct r300_texture *tex,
                                          unsigned level,
                                          boolean rv350_mode,
-                                         int dim)
+                                         enum r300_dim dim)
 {
     unsigned tile, texdim;
 
-    tile = r300_texture_get_tile_size(tex, dim, TRUE);
-    if (dim == TILE_WIDTH) {
+    tile = r300_get_pixel_alignment(tex, R300_BUFFER_TILED, dim);
+    if (dim == DIM_WIDTH) {
         texdim = u_minify(tex->b.b.width0, level);
     } else {
         texdim = u_minify(tex->b.b.height0, level);
@@ -713,8 +740,8 @@
     width = u_minify(tex->b.b.width0, level);
 
     if (util_format_is_plain(tex->b.b.format)) {
-        tile_width = r300_texture_get_tile_size(tex, TILE_WIDTH,
-                                                tex->mip_macrotile[level]);
+        tile_width = r300_get_pixel_alignment(tex, tex->mip_macrotile[level],
+                                              DIM_WIDTH);
         width = align(width, tile_width);
 
         stride = util_format_get_stride(tex->b.b.format, width);
@@ -743,8 +770,8 @@
     height = u_minify(tex->b.b.height0, level);
 
     if (util_format_is_plain(tex->b.b.format)) {
-        tile_height = r300_texture_get_tile_size(tex, TILE_HEIGHT,
-                                                 tex->mip_macrotile[level]);
+        tile_height = r300_get_pixel_alignment(tex, tex->mip_macrotile[level],
+                                               DIM_HEIGHT);
         height = align(height, tile_height);
 
         /* This is needed for the kernel checker, unfortunately. */
@@ -784,21 +811,26 @@
     unsigned stride, size, layer_size, nblocksy, i;
     boolean rv350_mode = screen->caps.is_rv350;
 
-    SCREEN_DBG(screen, DBG_TEXALLOC, "r300: Making miptree for texture, format %s\n",
-               util_format_short_name(base->format));
+    SCREEN_DBG(screen, DBG_TEXALLOC,
+        "r300: Making miptree for texture, format %s\n",
+        util_format_short_name(base->format));
 
     for (i = 0; i <= base->last_level; i++) {
         /* Let's see if this miplevel can be macrotiled. */
         tex->mip_macrotile[i] =
             (tex->macrotile == R300_BUFFER_TILED &&
-             r300_texture_macro_switch(tex, i, rv350_mode, TILE_WIDTH) &&
-             r300_texture_macro_switch(tex, i, rv350_mode, TILE_HEIGHT)) ?
+             r300_texture_macro_switch(tex, i, rv350_mode, DIM_WIDTH) &&
+             r300_texture_macro_switch(tex, i, rv350_mode, DIM_HEIGHT)) ?
              R300_BUFFER_TILED : R300_BUFFER_LINEAR;
 
         stride = r300_texture_get_stride(screen, tex, i);
         nblocksy = r300_texture_get_nblocksy(tex, i);
         layer_size = stride * nblocksy;
 
+        if (base->nr_samples) {
+            layer_size *= base->nr_samples;
+        }
+
         if (base->target == PIPE_TEXTURE_CUBE)
             size = layer_size * 6;
         else
@@ -864,8 +896,8 @@
     }
 
     /* Set macrotiling. */
-    if (r300_texture_macro_switch(tex, 0, rv350_mode, TILE_WIDTH) &&
-        r300_texture_macro_switch(tex, 0, rv350_mode, TILE_HEIGHT)) {
+    if (r300_texture_macro_switch(tex, 0, rv350_mode, DIM_WIDTH) &&
+        r300_texture_macro_switch(tex, 0, rv350_mode, DIM_HEIGHT)) {
         tex->macrotile = R300_BUFFER_TILED;
     }
 }
@@ -899,17 +931,14 @@
 {
     struct r300_winsys_screen *rws = (struct r300_winsys_screen *)screen->winsys;
     struct r300_texture* tex = (struct r300_texture*)texture;
-    unsigned stride;
 
     if (!tex) {
         return FALSE;
     }
 
-    stride = r300_texture_get_stride(r300_screen(screen), tex, 0);
+    whandle->stride = r300_texture_get_stride(r300_screen(screen), tex, 0);
 
-    rws->buffer_get_handle(rws, tex->buffer, stride, whandle);
-
-    return TRUE;
+    return rws->buffer_get_handle(rws, tex->buffer, whandle);
 }
 
 struct u_resource_vtbl r300_texture_vtbl = 
@@ -1003,26 +1032,27 @@
 					  unsigned flags)
 {
     struct r300_texture* tex = r300_texture(texture);
-    struct pipe_surface* surface = CALLOC_STRUCT(pipe_surface);
-    unsigned offset;
-
-    offset = r300_texture_get_offset(tex, level, zslice, face);
+    struct r300_surface* surface = CALLOC_STRUCT(r300_surface);
 
     if (surface) {
-        pipe_reference_init(&surface->reference, 1);
-        pipe_resource_reference(&surface->texture, texture);
-        surface->format = texture->format;
-        surface->width = u_minify(texture->width0, level);
-        surface->height = u_minify(texture->height0, level);
-        surface->offset = offset;
-        surface->usage = flags;
-        surface->zslice = zslice;
-        surface->texture = texture;
-        surface->face = face;
-        surface->level = level;
+        pipe_reference_init(&surface->base.reference, 1);
+        pipe_resource_reference(&surface->base.texture, texture);
+        surface->base.format = texture->format;
+        surface->base.width = u_minify(texture->width0, level);
+        surface->base.height = u_minify(texture->height0, level);
+        surface->base.usage = flags;
+        surface->base.zslice = zslice;
+        surface->base.face = face;
+        surface->base.level = level;
+
+        surface->buffer = tex->buffer;
+        surface->domain = tex->domain;
+        surface->offset = r300_texture_get_offset(tex, level, zslice, face);
+        surface->pitch = tex->fb_state.pitch[level];
+        surface->format = tex->fb_state.format;
     }
 
-    return surface;
+    return surface ? &surface->base : NULL; /* NULL on failed alloc */
 }
 
 /* Not required to implement u_resource_vtbl, consider moving to another file:
@@ -1042,7 +1072,6 @@
     struct r300_screen* rscreen = r300_screen(screen);
     struct r300_winsys_buffer *buffer;
     struct r300_texture* tex;
-    unsigned stride;
     boolean override_zb_flags;
 
     /* Support only 2D textures without mipmaps */
@@ -1052,7 +1081,7 @@
         return NULL;
     }
 
-    buffer = rws->buffer_from_handle(rws, screen, whandle, &stride);
+    buffer = rws->buffer_from_handle(rws, whandle->handle);
     if (!buffer) {
         return NULL;
     }
@@ -1068,7 +1097,7 @@
     tex->b.b.screen = screen;
     tex->domain = R300_DOMAIN_VRAM;
 
-    tex->stride_override = stride;
+    tex->stride_override = whandle->stride;
 
     /* one ref already taken */
     tex->buffer = buffer;
@@ -1080,7 +1109,7 @@
                "Pitch: % 4i, Dim: %ix%i, Format: %s\n",
                tex->macrotile ? "YES" : " NO",
                tex->microtile ? "YES" : " NO",
-               stride / util_format_get_blocksize(base->format),
+               whandle->stride / util_format_get_blocksize(base->format),
                base->width0, base->height0,
                util_format_short_name(base->format));
 
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index ff640c5..99e7694 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -23,8 +23,11 @@
 #ifndef R300_TEXTURE_H
 #define R300_TEXTURE_H
 
-#include "util/u_format.h"
+#include "pipe/p_format.h"
 
+struct pipe_screen;
+struct pipe_resource;
+struct winsys_handle;
 struct r300_texture;
 struct r300_screen;
 
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 89f39af..5394e04 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -105,12 +105,12 @@
      /* case TGSI_OPCODE_DIV: return RC_OPCODE_DIV; */
      /* case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; */
         case TGSI_OPCODE_TXL: return RC_OPCODE_TXL;
-     /* case TGSI_OPCODE_BRK: return RC_OPCODE_BRK; */
+        case TGSI_OPCODE_BRK: return RC_OPCODE_BRK;
         case TGSI_OPCODE_IF: return RC_OPCODE_IF;
-     /* case TGSI_OPCODE_LOOP: return RC_OPCODE_LOOP; */
+        case TGSI_OPCODE_BGNLOOP: return RC_OPCODE_BGNLOOP;
         case TGSI_OPCODE_ELSE: return RC_OPCODE_ELSE;
         case TGSI_OPCODE_ENDIF: return RC_OPCODE_ENDIF;
-     /* case TGSI_OPCODE_ENDLOOP: return RC_OPCODE_ENDLOOP; */
+        case TGSI_OPCODE_ENDLOOP: return RC_OPCODE_ENDLOOP;
      /* case TGSI_OPCODE_PUSHA: return RC_OPCODE_PUSHA; */
      /* case TGSI_OPCODE_POPA: return RC_OPCODE_POPA; */
         case TGSI_OPCODE_CEIL: return RC_OPCODE_CEIL;
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index 4f37fab..02421a5 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -57,22 +57,11 @@
     subdst.face = 0;
     subdst.level = 0;
 
-    /* XXX if we don't flush before copying the texture and mapping it,
-     * we get wrong pixels, i.e. it's like latest draw calls didn't happen,
-     * including this blit. Tests: e.g. piglit/provoking-vertex
-     *
-     * Since the flush immediately before mapping is implicit (the buffer is
-     * always referenced in resource_copy_region), every read transfer costs
-     * 2 flushes. That sucks. */
-    ctx->flush(ctx, 0, NULL);
-
     ctx->resource_copy_region(ctx, &r300transfer->detiled_texture->b.b, subdst,
 			      0, 0, 0,
 			      tex, transfer->sr,
 			      transfer->box.x, transfer->box.y, transfer->box.z,
 			      transfer->box.width, transfer->box.height);
-
-    /* Flushing after the copy is implicit, issued by winsys. */
 }
 
 /* Copy a detiled texture to a tiled one. */
@@ -92,7 +81,6 @@
 			      0, 0, 0,
 			      transfer->box.width, transfer->box.height);
 
-    /* XXX this flush fixes a few piglit tests (e.g. glean/pixelFormats). */
     ctx->flush(ctx, 0, NULL);
 }
 
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index 6ce2189..77c1c13 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -47,13 +47,6 @@
     R300_REF_HW = 2
 };
 
-struct r300_cs_info {
-    /* In DWORDs. */
-    unsigned used;
-    unsigned free;
-    unsigned capacity;
-};
-
 struct r300_winsys_screen {
     void (*destroy)(struct r300_winsys_screen *ws);
     
@@ -109,16 +102,13 @@
      * Returns TRUE if a flush is required. */
     boolean (*validate)(struct r300_winsys_screen* winsys);
 
-    /* Return current CS info. */
-    void (*get_cs_info)(struct r300_winsys_screen *winsys,
-                        struct r300_cs_info *info);
+    /* Return the number of free dwords in CS. */
+    unsigned (*get_cs_free_dwords)(struct r300_winsys_screen *winsys);
 
-    /* Start a command emit. */
-    void (*begin_cs)(struct r300_winsys_screen* winsys,
-                     int size,
-                     const char* file,
-                     const char* function,
-                     int line);
+    /* Return the pointer to the first free dword in CS and assume a pipe
+     * driver wants to fill "count" dwords. */
+    uint32_t *(*get_cs_pointer)(struct r300_winsys_screen *winsys,
+                                unsigned count);
 
     /* Write a dword to the command buffer. */
     void (*write_cs_dword)(struct r300_winsys_screen* winsys, uint32_t dword);
@@ -134,12 +124,6 @@
                            enum r300_buffer_domain wd,
                            uint32_t flags);
 
-    /* Finish a command emit. */
-    void (*end_cs)(struct r300_winsys_screen* winsys,
-                   const char* file,
-                   const char* function,
-                   int line);
-
     /* Flush the CS. */
     void (*flush_cs)(struct r300_winsys_screen* winsys);
 
@@ -164,12 +148,10 @@
 			  enum r300_value_id vid);
 
     struct r300_winsys_buffer *(*buffer_from_handle)(struct r300_winsys_screen *winsys,
-						     struct pipe_screen *screen,
-						     struct winsys_handle *whandle,
-						     unsigned *stride);
+                                                     unsigned handle);
+
     boolean (*buffer_get_handle)(struct r300_winsys_screen *winsys,
 				 struct r300_winsys_buffer *buffer,
-				 unsigned stride,
 				 struct winsys_handle *whandle);
 
     boolean (*is_buffer_referenced)(struct r300_winsys_screen *winsys,
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 401a28a..12ef98a 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -114,6 +114,11 @@
       pipe_sampler_view_reference(&softpipe->vertex_sampler_views[i], NULL);
    }
 
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      sp_destroy_tex_tile_cache(softpipe->geometry_tex_cache[i]);
+      pipe_sampler_view_reference(&softpipe->geometry_sampler_views[i], NULL);
+   }
+
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       uint j;
 
@@ -174,7 +179,12 @@
           softpipe->vertex_tex_cache[i]->texture == texture)
          return PIPE_REFERENCED_FOR_READ;
    }
-   
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      if (softpipe->geometry_tex_cache[i] &&
+          softpipe->geometry_tex_cache[i]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
+
    return PIPE_UNREFERENCED;
 }
 
@@ -225,6 +235,7 @@
    softpipe->pipe.create_sampler_state = softpipe_create_sampler_state;
    softpipe->pipe.bind_fragment_sampler_states  = softpipe_bind_sampler_states;
    softpipe->pipe.bind_vertex_sampler_states = softpipe_bind_vertex_sampler_states;
+   softpipe->pipe.bind_geometry_sampler_states = softpipe_bind_geometry_sampler_states;
    softpipe->pipe.delete_sampler_state = softpipe_delete_sampler_state;
 
    softpipe->pipe.create_depth_stencil_alpha_state = softpipe_create_depth_stencil_state;
@@ -265,6 +276,7 @@
    softpipe->pipe.set_scissor_state = softpipe_set_scissor_state;
    softpipe->pipe.set_fragment_sampler_views = softpipe_set_sampler_views;
    softpipe->pipe.set_vertex_sampler_views = softpipe_set_vertex_sampler_views;
+   softpipe->pipe.set_geometry_sampler_views = softpipe_set_geometry_sampler_views;
    softpipe->pipe.create_sampler_view = softpipe_create_sampler_view;
    softpipe->pipe.sampler_view_destroy = softpipe_sampler_view_destroy;
    softpipe->pipe.set_viewport_state = softpipe_set_viewport_state;
@@ -301,6 +313,9 @@
    for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
       softpipe->vertex_tex_cache[i] = sp_create_tex_tile_cache( &softpipe->pipe );
    }
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      softpipe->geometry_tex_cache[i] = sp_create_tex_tile_cache( &softpipe->pipe );
+   }
 
    softpipe->fs_machine = tgsi_exec_machine_create();
 
@@ -319,10 +334,17 @@
       goto fail;
 
    draw_texture_samplers(softpipe->draw,
+                         PIPE_SHADER_VERTEX,
                          PIPE_MAX_VERTEX_SAMPLERS,
                          (struct tgsi_sampler **)
                             softpipe->tgsi.vert_samplers_list);
 
+   draw_texture_samplers(softpipe->draw,
+                         PIPE_SHADER_GEOMETRY,
+                         PIPE_MAX_GEOMETRY_SAMPLERS,
+                         (struct tgsi_sampler **)
+                            softpipe->tgsi.geom_samplers_list);
+
    if (debug_get_bool_option( "SP_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index e641a81..53115a8 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -60,6 +60,7 @@
    struct pipe_blend_state *blend;
    struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
    struct pipe_sampler_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
+   struct pipe_sampler_state *geometry_samplers[PIPE_MAX_GEOMETRY_SAMPLERS];
    struct pipe_depth_stencil_alpha_state *depth_stencil;
    struct pipe_rasterizer_state *rasterizer;
    struct sp_fragment_shader *fs;
@@ -78,6 +79,7 @@
    struct pipe_scissor_state scissor;
    struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
    struct pipe_sampler_view *vertex_sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
+   struct pipe_sampler_view *geometry_sampler_views[PIPE_MAX_GEOMETRY_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct {
@@ -92,6 +94,8 @@
    unsigned num_sampler_views;
    unsigned num_vertex_samplers;
    unsigned num_vertex_sampler_views;
+   unsigned num_geometry_samplers;
+   unsigned num_geometry_sampler_views;
    unsigned num_vertex_buffers;
 
    unsigned dirty; /**< Mask of SP_NEW_x flags */
@@ -148,6 +152,7 @@
 
    /** TGSI exec things */
    struct {
+      struct sp_sampler_varient *geom_samplers_list[PIPE_MAX_GEOMETRY_SAMPLERS];
       struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_VERTEX_SAMPLERS];
       struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
    } tgsi;
@@ -169,6 +174,7 @@
    unsigned tex_timestamp;
    struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
    struct softpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
+   struct softpipe_tex_tile_cache *geometry_tex_cache[PIPE_MAX_GEOMETRY_SAMPLERS];
 
    unsigned use_sse : 1;
    unsigned dump_fs : 1;
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 5024fc8..4a53ef0 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -56,6 +56,9 @@
       for (i = 0; i < softpipe->num_vertex_sampler_views; i++) {
          sp_flush_tex_tile_cache(softpipe->vertex_tex_cache[i]);
       }
+      for (i = 0; i < softpipe->num_geometry_sampler_views; i++) {
+         sp_flush_tex_tile_cache(softpipe->geometry_tex_cache[i]);
+      }
    }
 
    if (flags & PIPE_FLUSH_SWAPBUFFERS) {
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index ddfe56f..c60249d 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -543,15 +543,17 @@
 }
 
 static void
-sp_vbuf_so_info(struct vbuf_render *vbr, uint buffer, uint vertices)
+sp_vbuf_so_info(struct vbuf_render *vbr, uint primitives, uint vertices)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
+   unsigned i;
 
-   softpipe->so_target.so_count[buffer] += vertices;
+   for (i = 0; i < softpipe->so_target.num_buffers; ++i) {
+      softpipe->so_target.so_count[i] += vertices;
+   }
 
-   softpipe->so_stats.num_primitives_written =
-      vertices / u_vertices_per_prim(cvbr->prim);
+   softpipe->so_stats.num_primitives_written = primitives;
    softpipe->so_stats.primitives_storage_needed =
       vertices * 4 /*sizeof(float|int32)*/ * 4 /*x,y,z,w*/;
 }
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 245f1b5..4ae69c1 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -58,7 +58,9 @@
 
    assert(type == PIPE_QUERY_OCCLUSION_COUNTER ||
           type == PIPE_QUERY_TIME_ELAPSED ||
-          type == PIPE_QUERY_SO_STATISTICS);
+          type == PIPE_QUERY_SO_STATISTICS ||
+          type == PIPE_QUERY_GPU_FINISHED ||
+          type == PIPE_QUERY_TIMESTAMP_DISJOINT);
    sq = CALLOC_STRUCT( softpipe_query );
    sq->type = type;
 
@@ -78,7 +80,7 @@
 {
    struct softpipe_context *softpipe = softpipe_context( pipe );
    struct softpipe_query *sq = softpipe_query(q);
-   
+
    switch (sq->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
       sq->start = softpipe->occlusion_count;
@@ -90,6 +92,9 @@
       sq->so.num_primitives_written = 0;
       sq->so.primitives_storage_needed = 0;
       break;
+   case PIPE_QUERY_GPU_FINISHED:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      break; /* accepted at creation; must not fall into assert(0) */
    default:
       assert(0);
       break;
@@ -119,6 +124,9 @@
       sq->so.primitives_storage_needed =
          softpipe->so_stats.primitives_storage_needed;
       break;
+   case PIPE_QUERY_GPU_FINISHED:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      break;
    default:
       assert(0);
       break;
@@ -141,6 +149,18 @@
       memcpy(vresult, &sq->so,
              sizeof(struct pipe_query_data_so_statistics));
       break;
+   case PIPE_QUERY_GPU_FINISHED:
+      *result = TRUE;
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT: {
+      struct pipe_query_data_timestamp_disjoint td;
+      /* os_get_time is in microseconds, i.e. 1000000 ticks per second */
+      td.frequency = 1000000;
+      td.disjoint = FALSE;
+      memcpy(vresult, &td, /* return the local result, not sq->so */
+             sizeof(struct pipe_query_data_timestamp_disjoint));
+   }
+      break;
    default:
       *result = sq->end - sq->start;
       break;
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index d0b73cc..7d6b86d 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -100,6 +100,7 @@
 struct sp_geometry_shader {
    struct pipe_shader_state shader;
    struct draw_geometry_shader *draw_data;
+   int max_sampler; /**< copy of draw_data->info.file_max[TGSI_FILE_SAMPLER] */
 };
 
 struct sp_velems_state {
@@ -128,6 +129,10 @@
 softpipe_bind_vertex_sampler_states(struct pipe_context *,
                                     unsigned num_samplers,
                                     void **samplers);
+void
+softpipe_bind_geometry_sampler_states(struct pipe_context *,
+                                      unsigned num_samplers,
+                                      void **samplers);
 void softpipe_delete_sampler_state(struct pipe_context *, void *);
 
 void *
@@ -195,6 +200,11 @@
                                   unsigned num,
                                   struct pipe_sampler_view **);
 
+void
+softpipe_set_geometry_sampler_views(struct pipe_context *,
+                                    unsigned num,
+                                    struct pipe_sampler_view **);
+
 struct pipe_sampler_view *
 softpipe_create_sampler_view(struct pipe_context *pipe,
                              struct pipe_resource *texture,
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 4c6d490..3ba4d93 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -225,6 +225,19 @@
          }
       }
    }
+
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      struct softpipe_tex_tile_cache *tc = softpipe->geometry_tex_cache[i];
+
+      if (tc->texture) {
+         struct softpipe_resource *spt = softpipe_resource(tc->texture);
+
+         if (spt->timestamp != tc->timestamp) {
+	    sp_tex_tile_cache_validate_texture(tc);
+            tc->timestamp = spt->timestamp;
+         }
+      }
+   }
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 2fff80c..3fbf1f2 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -35,6 +35,7 @@
 #include "util/u_inlines.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vs.h"
+#include "draw/draw_gs.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_scan.h"
@@ -223,6 +224,8 @@
    if (state->draw_data == NULL)
       goto fail;
 
+   state->max_sampler = state->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+
    return state;
 
 fail:
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index 2692f06..79d9516 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -121,6 +121,33 @@
    softpipe->dirty |= SP_NEW_SAMPLER;
 }
 
+void
+softpipe_bind_geometry_sampler_states(struct pipe_context *pipe,
+                                      unsigned num_samplers,
+                                      void **samplers)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned i;
+
+   assert(num_samplers <= PIPE_MAX_GEOMETRY_SAMPLERS);
+
+   /* Check for no-op */
+   if (num_samplers == softpipe->num_geometry_samplers &&
+       !memcmp(softpipe->geometry_samplers, samplers, num_samplers * sizeof(void *)))
+      return;
+
+   draw_flush(softpipe->draw);
+
+   for (i = 0; i < num_samplers; ++i)
+      softpipe->geometry_samplers[i] = samplers[i];
+   for (i = num_samplers; i < PIPE_MAX_GEOMETRY_SAMPLERS; ++i)
+      softpipe->geometry_samplers[i] = NULL;
+
+   softpipe->num_geometry_samplers = num_samplers;
+
+   softpipe->dirty |= SP_NEW_SAMPLER;
+}
+
 
 struct pipe_sampler_view *
 softpipe_create_sampler_view(struct pipe_context *pipe,
@@ -210,6 +237,36 @@
    softpipe->dirty |= SP_NEW_TEXTURE;
 }
 
+void
+softpipe_set_geometry_sampler_views(struct pipe_context *pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   uint i;
+
+   assert(num <= PIPE_MAX_GEOMETRY_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == softpipe->num_geometry_sampler_views &&
+       !memcmp(softpipe->geometry_sampler_views, views, num * sizeof(struct pipe_sampler_view *))) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      struct pipe_sampler_view *view = i < num ? views[i] : NULL;
+
+      pipe_sampler_view_reference(&softpipe->geometry_sampler_views[i], view);
+      sp_tex_tile_cache_set_sampler_view(softpipe->geometry_tex_cache[i], view);
+   }
+
+   softpipe->num_geometry_sampler_views = num;
+
+   softpipe->dirty |= SP_NEW_TEXTURE;
+}
+
 
 /**
  * Find/create an sp_sampler_varient object for sampling the given texture,
@@ -293,6 +350,30 @@
       }
    }
 
+   if (softpipe->gs) {
+      for (i = 0; i <= softpipe->gs->max_sampler; i++) {
+         if (softpipe->geometry_samplers[i]) {
+            struct pipe_resource *texture = NULL;
+
+            if (softpipe->geometry_sampler_views[i]) {
+               texture = softpipe->geometry_sampler_views[i]->texture;
+            }
+
+            softpipe->tgsi.geom_samplers_list[i] =
+               get_sampler_varient(
+                  i,
+                  sp_sampler(softpipe->geometry_samplers[i]),
+                  texture,
+                  TGSI_PROCESSOR_GEOMETRY );
+
+            sp_sampler_varient_bind_texture(
+               softpipe->tgsi.geom_samplers_list[i],
+               softpipe->geometry_tex_cache[i],
+               texture );
+         }
+      }
+   }
+
    for (i = 0; i <= softpipe->fs->info.file_max[TGSI_FILE_SAMPLER]; i++) {
       if (softpipe->sampler[i]) {
          struct pipe_resource *texture = NULL;
diff --git a/src/gallium/drivers/softpipe/sp_state_so.c b/src/gallium/drivers/softpipe/sp_state_so.c
index 27acd3d..cfe23f9 100644
--- a/src/gallium/drivers/softpipe/sp_state_so.c
+++ b/src/gallium/drivers/softpipe/sp_state_so.c
@@ -89,6 +89,8 @@
    void *map_buffers[PIPE_MAX_SO_BUFFERS];
 
    assert(num_buffers <= PIPE_MAX_SO_BUFFERS);
+   if (num_buffers > PIPE_MAX_SO_BUFFERS)
+      num_buffers = PIPE_MAX_SO_BUFFERS;
 
    softpipe->dirty |= SP_NEW_SO_BUFFERS;
 
@@ -99,7 +101,7 @@
       if (!res) {
          /* the whole call is invalid, bail out */
          softpipe->so_target.num_buffers = 0;
-         draw_set_mapped_so_buffers(softpipe->draw, map_buffers, 0);
+         draw_set_mapped_so_buffers(softpipe->draw, 0, 0);
          return;
       }
 
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 72afad6..7ec3d63 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -158,6 +158,9 @@
    void   (*bind_vertex_sampler_states)(struct pipe_context *,
                                         unsigned num_samplers,
                                         void **samplers);
+   void   (*bind_geometry_sampler_states)(struct pipe_context *,
+                                          unsigned num_samplers,
+                                          void **samplers);
    void   (*delete_sampler_state)(struct pipe_context *, void *);
 
    void * (*create_rasterizer_state)(struct pipe_context *,
@@ -238,6 +241,10 @@
                                     unsigned num_views,
                                     struct pipe_sampler_view **);
 
+   void (*set_geometry_sampler_views)(struct pipe_context *,
+                                      unsigned num_views,
+                                      struct pipe_sampler_view **);
+
    void (*set_vertex_buffers)( struct pipe_context *,
                                unsigned num_buffers,
                                const struct pipe_vertex_buffer * );
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 85551ca..3b87d99 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -382,7 +382,9 @@
 #define PIPE_QUERY_PRIMITIVES_EMITTED    2
 #define PIPE_QUERY_TIME_ELAPSED          3
 #define PIPE_QUERY_SO_STATISTICS         5
-#define PIPE_QUERY_TYPES                 6
+#define PIPE_QUERY_GPU_FINISHED          6
+#define PIPE_QUERY_TIMESTAMP_DISJOINT    7
+#define PIPE_QUERY_TYPES                 8
 
 
 /**
@@ -507,6 +509,11 @@
    uint64_t num_primitives_written;
    uint64_t primitives_storage_needed;
 };
+struct pipe_query_data_timestamp_disjoint
+{
+   uint64_t frequency; /**< presumably ticks per second (softpipe: 1000000) - confirm */
+   boolean  disjoint;  /**< softpipe reports FALSE; NOTE(review): TRUE likely means unreliable - confirm */
+};
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index c46c7e3..9df20ea 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -65,16 +65,18 @@
 };
 
 enum tgsi_file_type {
-   TGSI_FILE_NULL         =0,
-   TGSI_FILE_CONSTANT     =1,
-   TGSI_FILE_INPUT        =2,
-   TGSI_FILE_OUTPUT       =3,
-   TGSI_FILE_TEMPORARY    =4,
-   TGSI_FILE_SAMPLER      =5,
-   TGSI_FILE_ADDRESS      =6,
-   TGSI_FILE_IMMEDIATE    =7,
-   TGSI_FILE_PREDICATE    =8,
-   TGSI_FILE_SYSTEM_VALUE =9,
+   TGSI_FILE_NULL                =0,
+   TGSI_FILE_CONSTANT            =1,
+   TGSI_FILE_INPUT               =2,
+   TGSI_FILE_OUTPUT              =3,
+   TGSI_FILE_TEMPORARY           =4,
+   TGSI_FILE_SAMPLER             =5,
+   TGSI_FILE_ADDRESS             =6,
+   TGSI_FILE_IMMEDIATE           =7,
+   TGSI_FILE_PREDICATE           =8,
+   TGSI_FILE_SYSTEM_VALUE        =9,
+   TGSI_FILE_IMMEDIATE_ARRAY     =10,
+   TGSI_FILE_TEMPORARY_ARRAY     =11,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
 
@@ -159,9 +161,9 @@
 struct tgsi_immediate
 {
    unsigned Type       : 4;  /**< TGSI_TOKEN_TYPE_IMMEDIATE */
-   unsigned NrTokens   : 8;  /**< UINT */
+   unsigned NrTokens   : 14; /**< UINT */
    unsigned DataType   : 4;  /**< one of TGSI_IMM_x */
-   unsigned Padding    : 16;
+   unsigned Padding    : 10;
 };
 
 union tgsi_immediate_data
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 5ed1cca..6231f06 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -60,6 +60,7 @@
 #define PIPE_MAX_CONSTANT_BUFFERS 32
 #define PIPE_MAX_SAMPLERS         16
 #define PIPE_MAX_VERTEX_SAMPLERS  16
+#define PIPE_MAX_GEOMETRY_SAMPLERS  16
 #define PIPE_MAX_SHADER_INPUTS    16
 #define PIPE_MAX_SHADER_OUTPUTS   16
 #define PIPE_MAX_TEXTURE_LEVELS   16
diff --git a/src/gallium/state_trackers/egl/Makefile b/src/gallium/state_trackers/egl/Makefile
index 8933890..fec178f 100644
--- a/src/gallium/state_trackers/egl/Makefile
+++ b/src/gallium/state_trackers/egl/Makefile
@@ -31,15 +31,20 @@
 kms_OBJECTS = $(kms_SOURCES:.c=.o)
 
 
-ALL_INCLUDES = $(common_INCLUDES) $(x11_INCLUDES) $(kms_INCLUDES)
-ALL_SOURCES = $(common_SOURCES) $(x11_SOURCES) $(kms_SOURCES)
-ALL_OBJECTS = $(common_OBJECTS) $(x11_OBJECTS) $(kms_OBJECTS)
+fbdev_INCLUDES = -I$(TOP)/src/gallium/winsys/sw -I$(TOP)/src/gallium/drivers
+fbdev_SOURCES = $(wildcard fbdev/*.c)
+fbdev_OBJECTS = $(fbdev_SOURCES:.c=.o)
+
+
+ALL_INCLUDES = $(common_INCLUDES) $(x11_INCLUDES) $(kms_INCLUDES) $(fbdev_INCLUDES)
+ALL_SOURCES = $(common_SOURCES) $(x11_SOURCES) $(kms_SOURCES) $(fbdev_SOURCES)
+ALL_OBJECTS = $(common_OBJECTS) $(x11_OBJECTS) $(kms_OBJECTS) $(fbdev_OBJECTS)
 
 ##### TARGETS #####
 
-EGL_DISPLAYS_MODS = $(foreach dpy, $(EGL_DISPLAYS), libegl$(dpy).a)
+EGL_PLATFORMS_MODS = $(foreach plat, $(EGL_PLATFORMS), libegl$(plat).a)
 
-default: depend $(EGL_DISPLAYS_MODS)
+default: depend $(EGL_PLATFORMS_MODS)
 
 
 libeglx11.a: $(x11_OBJECTS) $(common_OBJECTS) Makefile
@@ -48,6 +53,9 @@
 libeglkms.a: $(kms_OBJECTS) $(common_OBJECTS) Makefile
 	$(MKLIB) -o eglkms -static $(kms_OBJECTS) $(common_OBJECTS)
 
+libeglfbdev.a: $(fbdev_OBJECTS) $(common_OBJECTS) Makefile
+	$(MKLIB) -o eglfbdev -static $(fbdev_OBJECTS) $(common_OBJECTS)
+
 depend: 
 	rm -f depend
 	touch depend
@@ -55,7 +63,7 @@
 
 clean:
 	rm -f $(ALL_OBJECTS)
-	rm -f $(EGL_DISPLAYS_MODS)
+	rm -f $(EGL_PLATFORMS_MODS)
 	rm -f depend depend.bak
 
 # Dummy target
@@ -73,4 +81,7 @@
 $(kms_OBJECTS): %.o: %.c
 	$(CC) -c $(common_INCLUDES) $(kms_INCLUDES) $(DEFINES) $(CFLAGS) $< -o $@
 
+$(fbdev_OBJECTS): %.o: %.c
+	$(CC) -c $(common_INCLUDES) $(fbdev_INCLUDES) $(DEFINES) $(CFLAGS) $< -o $@
+
 sinclude depend
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.c b/src/gallium/state_trackers/egl/common/egl_g3d.c
index 361cc79..8c7d2cb 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.c
@@ -74,10 +74,10 @@
    struct native_probe *nprobe;
 
    nprobe = (struct native_probe *) _eglGetProbeCache(gdrv->probe_key);
-   if (!nprobe || nprobe->display != dpy->NativeDisplay) {
+   if (!nprobe || nprobe->display != dpy->PlatformDisplay) {
       if (nprobe)
          nprobe->destroy(nprobe);
-      nprobe = native_create_probe(dpy->NativeDisplay);
+      nprobe = native_create_probe(dpy->PlatformDisplay);
       _eglSetProbeCache(gdrv->probe_key, (void *) nprobe);
    }
 
@@ -96,7 +96,7 @@
    struct native_probe *nprobe;
 
    nprobe = (struct native_probe *) _eglGetProbeCache(gdrv->probe_key);
-   if (nprobe && (!dpy || nprobe->display == dpy->NativeDisplay)) {
+   if (nprobe && (!dpy || nprobe->display == dpy->PlatformDisplay)) {
       nprobe->destroy(nprobe);
       _eglSetProbeCache(gdrv->probe_key, NULL);
    }
@@ -479,7 +479,7 @@
    }
    dpy->DriverData = gdpy;
 
-   gdpy->native = native_create_display(dpy->NativeDisplay,
+   gdpy->native = native_create_display(dpy->PlatformDisplay,
          &egl_g3d_native_event_handler);
    if (!gdpy->native) {
       _eglError(EGL_NOT_INITIALIZED, "eglInitialize(no usable display)");
diff --git a/src/gallium/state_trackers/egl/common/native.h b/src/gallium/state_trackers/egl/common/native.h
index 3f60348..494becb 100644
--- a/src/gallium/state_trackers/egl/common/native.h
+++ b/src/gallium/state_trackers/egl/common/native.h
@@ -211,7 +211,6 @@
 native_get_name(void);
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      struct native_event_handler *handler);
+native_create_display(void *dpy, struct native_event_handler *handler);
 
 #endif /* _NATIVE_H_ */
diff --git a/src/gallium/state_trackers/egl/common/native_probe.h b/src/gallium/state_trackers/egl/common/native_probe.h
index aeed9f8..539c4aa 100644
--- a/src/gallium/state_trackers/egl/common/native_probe.h
+++ b/src/gallium/state_trackers/egl/common/native_probe.h
@@ -43,7 +43,7 @@
  */
 struct native_probe {
    int magic;
-   EGLNativeDisplayType display;
+   void *display;
    void *data;
 
    void (*destroy)(struct native_probe *nprobe);
@@ -57,7 +57,7 @@
  * same display.
  */
 struct native_probe *
-native_create_probe(EGLNativeDisplayType dpy);
+native_create_probe(void *dpy);
 
 /**
  * Probe the probe object.
diff --git a/src/gallium/state_trackers/egl/fbdev/native_fbdev.c b/src/gallium/state_trackers/egl/fbdev/native_fbdev.c
new file mode 100644
index 0000000..399c125
--- /dev/null
+++ b/src/gallium/state_trackers/egl/fbdev/native_fbdev.c
@@ -0,0 +1,469 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <linux/fb.h>
+
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_pointer.h"
+
+#include "common/native.h"
+#include "common/native_helper.h"
+#include "fbdev/fbdev_sw_winsys.h"
+
+struct fbdev_display {
+   struct native_display base;   /* first member, so fbdev_display() can cast */
+
+   int fd;   /* framebuffer device descriptor; closed by fbdev_display_destroy */
+   struct native_event_handler *event_handler;   /* notified on buffer swaps */
+
+   struct fb_fix_screeninfo finfo;   /* result of FBIOGET_FSCREENINFO */
+   struct fb_var_screeninfo vinfo;   /* result of FBIOGET_VSCREENINFO */
+   /* the single config, connector and mode this backend reports */
+   struct native_config config;
+   struct native_connector connector;
+   struct native_mode mode;
+
+   struct fbdev_surface *current_surface;   /* set by fbdev_display_program */
+};
+
+struct fbdev_surface {
+   struct native_surface base;   /* first member, so fbdev_surface() can cast */
+
+   struct fbdev_display *fbdpy;      /* display that created the surface */
+   struct resource_surface *rsurf;   /* holds the attachment resources */
+   int width, height;                /* assigned once, at creation */
+
+   unsigned int sequence_number;     /* incremented on every buffer swap */
+
+   boolean is_current;   /* TRUE while programmed as the display's surface */
+};
+
+static INLINE struct fbdev_display *
+fbdev_display(const struct native_display *ndpy)
+{
+   return (struct fbdev_display *) ndpy;   /* base is the first member */
+}
+
+static INLINE struct fbdev_surface *
+fbdev_surface(const struct native_surface *nsurf)
+{
+   return (struct fbdev_surface *) nsurf;   /* base is the first member */
+}
+
+static boolean
+fbdev_surface_validate(struct native_surface *nsurf, uint attachment_mask,
+                     unsigned int *seq_num, struct pipe_resource **textures,
+                     int *width, int *height)
+{
+   struct fbdev_surface *fbsurf = fbdev_surface(nsurf);
+   /* the masked attachments must exist before they can be handed out */
+   if (!resource_surface_add_resources(fbsurf->rsurf, attachment_mask))
+      return FALSE;
+   if (textures)
+      resource_surface_get_resources(fbsurf->rsurf, textures, attachment_mask);
+   /* every out-parameter is optional; NULL ones are skipped */
+   if (seq_num)
+      *seq_num = fbsurf->sequence_number;
+   if (width)
+      *width = fbsurf->width;
+   if (height)
+      *height = fbsurf->height;
+
+   return TRUE;
+}
+
+static boolean
+fbdev_surface_flush_frontbuffer(struct native_surface *nsurf)
+{
+   struct fbdev_surface *fbsurf = fbdev_surface(nsurf);
+
+   /* only the surface programmed on the display is ever presented */
+   if (fbsurf->is_current)
+      return resource_surface_present(fbsurf->rsurf,
+            NATIVE_ATTACHMENT_FRONT_LEFT, NULL);
+   return TRUE;
+}
+
+static boolean
+fbdev_surface_swap_buffers(struct native_surface *nsurf)
+{
+   struct fbdev_surface *fbsurf = fbdev_surface(nsurf);
+   struct fbdev_display *fbdpy = fbsurf->fbdpy;
+   boolean ret = TRUE;
+   /* the back buffer is presented only while this surface is programmed */
+   if (fbsurf->is_current) {
+      ret = resource_surface_present(fbsurf->rsurf,
+            NATIVE_ATTACHMENT_BACK_LEFT, NULL);
+   }
+   /* the exchange happens even when nothing was presented or present failed */
+   resource_surface_swap_buffers(fbsurf->rsurf,
+         NATIVE_ATTACHMENT_FRONT_LEFT, NATIVE_ATTACHMENT_BACK_LEFT, TRUE);
+   /* front/back are exchanged now: advance the sequence number, report it */
+   fbsurf->sequence_number++;
+   fbdpy->event_handler->invalid_surface(&fbdpy->base,
+         &fbsurf->base, fbsurf->sequence_number);
+
+   return ret;
+}
+
+static void
+fbdev_surface_wait(struct native_surface *nsurf)
+{
+   /* intentionally empty: this backend does nothing when asked to wait */
+}
+
+static void
+fbdev_surface_destroy(struct native_surface *nsurf)
+{
+   struct fbdev_surface *fbsurf = fbdev_surface(nsurf);   /* freed below */
+   if (fbsurf->is_current) fbsurf->fbdpy->current_surface = NULL; /* unhook */
+   resource_surface_destroy(fbsurf->rsurf);   /* release the attachments */
+   FREE(fbsurf);   /* the display holds no pointer to it any more */
+}
+
+static struct native_surface *
+fbdev_display_create_scanout_surface(struct native_display *ndpy,
+                                   const struct native_config *nconf,
+                                   uint width, uint height)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   struct fbdev_surface *fbsurf;
+
+   fbsurf = CALLOC_STRUCT(fbdev_surface);
+   if (!fbsurf)
+      return NULL;
+   /* CALLOC leaves is_current FALSE until fbdev_display_program() picks it */
+   fbsurf->fbdpy = fbdpy;
+   fbsurf->width = width;
+   fbsurf->height = height;
+   /* nconf is dereferenced unchecked: callers must pass a valid config */
+   fbsurf->rsurf = resource_surface_create(fbdpy->base.screen,
+         nconf->color_format,
+         PIPE_BIND_RENDER_TARGET |
+         PIPE_BIND_DISPLAY_TARGET |
+         PIPE_BIND_SCANOUT);
+   if (!fbsurf->rsurf) {
+      FREE(fbsurf);
+      return NULL;
+   }
+
+   resource_surface_set_size(fbsurf->rsurf, fbsurf->width, fbsurf->height);
+   /* install the native_surface entry points */
+   fbsurf->base.destroy = fbdev_surface_destroy;
+   fbsurf->base.swap_buffers = fbdev_surface_swap_buffers;
+   fbsurf->base.flush_frontbuffer = fbdev_surface_flush_frontbuffer;
+   fbsurf->base.validate = fbdev_surface_validate;
+   fbsurf->base.wait = fbdev_surface_wait;
+
+   return &fbsurf->base;
+}
+
+static boolean
+fbdev_display_program(struct native_display *ndpy, int crtc_idx,
+                      struct native_surface *nsurf, uint x, uint y,
+                      const struct native_connector **nconns, int num_nconns,
+                      const struct native_mode *nmode)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   struct fbdev_surface *fbsurf = fbdev_surface(nsurf);
+   /* only a zero offset is accepted; crtc, connectors and mode are ignored */
+   if (x || y)
+      return FALSE;
+   /* nothing to do if nsurf is already programmed; else retire the old one */
+   if (fbdpy->current_surface) {
+      if (fbdpy->current_surface == fbsurf)
+         return TRUE;
+      fbdpy->current_surface->is_current = FALSE;
+   }
+   /* a NULL nsurf leaves the display without a programmed surface */
+   if (fbsurf)
+      fbsurf->is_current = TRUE;
+   fbdpy->current_surface = fbsurf;
+
+   return TRUE;
+}
+
+static const struct native_mode **
+fbdev_display_get_modes(struct native_display *ndpy,
+                      const struct native_connector *nconn,
+                      int *num_modes)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   const struct native_mode **list = MALLOC(sizeof(*list));
+
+   /* one freshly allocated slot; nconn is unused, *num_modes kept on failure */
+   if (!list)
+      return NULL;
+   list[0] = &fbdpy->mode;
+   if (num_modes)
+      *num_modes = 1;
+
+   return list;
+}
+
+static const struct native_connector **
+fbdev_display_get_connectors(struct native_display *ndpy, int *num_connectors,
+                           int *num_crtc)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   const struct native_connector **list = MALLOC(sizeof(*list));
+
+   /* NOTE(review): *num_crtc is never written here -- confirm callers cope */
+   if (!list)
+      return NULL;
+   list[0] = &fbdpy->connector;
+   if (num_connectors)
+      *num_connectors = 1;
+
+   return list;
+}
+
+static struct native_display_modeset fbdev_display_modeset = {
+   .get_connectors = fbdev_display_get_connectors,   /* one connector */
+   .get_modes = fbdev_display_get_modes,             /* one mode */
+   .create_scanout_surface = fbdev_display_create_scanout_surface,
+   .program = fbdev_display_program   /* selects the surface to present */
+};
+
+static const struct native_config **
+fbdev_display_get_configs(struct native_display *ndpy, int *num_configs)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   const struct native_config **list = MALLOC(sizeof(*list));
+
+   /* the array is freshly allocated; its only entry points into fbdpy */
+   if (!list)
+      return NULL;
+   list[0] = &fbdpy->config;
+   if (num_configs)
+      *num_configs = 1;
+
+   return list;
+}
+
+static int
+fbdev_display_get_param(struct native_display *ndpy,
+                      enum native_param_type param)
+{
+   int result = 0;   /* every query currently answers 0 */
+
+   switch (param) {
+   default:
+      /* no native_param_type is handled by this backend yet */
+      break;
+   }
+
+   return result;
+}
+
+static void
+fbdev_display_destroy(struct native_display *ndpy)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   /* tear down in order: pipe screen, then the device fd, then the struct */
+   fbdpy->base.screen->destroy(fbdpy->base.screen);
+   close(fbdpy->fd);   /* NOTE(review): <unistd.h> is not included directly */
+   FREE(fbdpy);
+}
+
+static boolean
+fbdev_display_init_modes(struct native_display *ndpy)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   struct native_mode *nmode = &fbdpy->mode;
+   /* the only mode is whatever resolution the framebuffer reports in vinfo */
+   nmode->desc = "Current Mode";
+   nmode->width = fbdpy->vinfo.xres;
+   nmode->height = fbdpy->vinfo.yres;
+   nmode->refresh_rate = 60 * 1000; /* dummy; presumably 60 Hz in mHz -- confirm */
+
+   return TRUE;
+}
+
+static boolean
+fbdev_display_init_connectors(struct native_display *ndpy)
+{
+   return TRUE;   /* nothing is filled in; fbdpy->connector is left untouched */
+}
+
+static enum pipe_format
+vinfo_to_format(const struct fb_var_screeninfo *vinfo)
+{
+   const unsigned r = vinfo->red.length;
+   const unsigned g = vinfo->green.length;
+   const unsigned b = vinfo->blue.length;
+   const unsigned a = vinfo->transp.length;
+
+   /*
+    * Only the channel widths are compared.
+    * NOTE(review): the channel offsets are not checked, so the B-G-R
+    * component order of the returned formats is assumed -- confirm.
+    */
+   if (vinfo->bits_per_pixel == 32) {
+      /* 8-8-8 color; alpha only when the transp channel is 8 bits too */
+      if (r == 8 && g == 8 && b == 8)
+         return (a == 8) ?
+            PIPE_FORMAT_B8G8R8A8_UNORM : PIPE_FORMAT_B8G8R8X8_UNORM;
+   }
+   else if (vinfo->bits_per_pixel == 16) {
+      /* 5-6-5 color without any alpha bits */
+      if (r == 5 && g == 6 && b == 5 && a == 0)
+         return PIPE_FORMAT_B5G6R5_UNORM;
+   }
+
+   return PIPE_FORMAT_NONE;
+}
+
+static boolean
+fbdev_display_init_configs(struct native_display *ndpy)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   struct native_config *nconf = &fbdpy->config;
+   /* a framebuffer layout vinfo_to_format() cannot map is rejected */
+   nconf->color_format = vinfo_to_format(&fbdpy->vinfo);
+   if (nconf->color_format == PIPE_FORMAT_NONE)
+      return FALSE;
+   /* front and back color buffers only */
+   nconf->buffer_mask =
+      (1 << NATIVE_ATTACHMENT_FRONT_LEFT) |
+      (1 << NATIVE_ATTACHMENT_BACK_LEFT);
+
+   nconf->scanout_bit = TRUE;   /* surfaces come from create_scanout_surface */
+
+   return TRUE;
+}
+
+static boolean
+fbdev_display_init(struct native_display *ndpy)
+{
+   struct fbdev_display *fbdpy = fbdev_display(ndpy);
+   struct sw_winsys *ws;
+   /* query the fixed and variable screen info; any ioctl failure aborts */
+   if (ioctl(fbdpy->fd, FBIOGET_FSCREENINFO, &fbdpy->finfo))
+      return FALSE;
+
+   if (ioctl(fbdpy->fd, FBIOGET_VSCREENINFO, &fbdpy->vinfo))
+      return FALSE;
+   /* only packed-pixel, true-color framebuffers are handled */
+   if (fbdpy->finfo.visual != FB_VISUAL_TRUECOLOR ||
+       fbdpy->finfo.type != FB_TYPE_PACKED_PIXELS)
+      return FALSE;
+
+   if (!fbdev_display_init_configs(&fbdpy->base) ||
+       !fbdev_display_init_connectors(&fbdpy->base) ||
+       !fbdev_display_init_modes(&fbdpy->base))
+      return FALSE;
+   /* NOTE(review): ws looks leaked if native_create_sw_screen() fails -- confirm */
+   ws = fbdev_create_sw_winsys(fbdpy->fd, fbdpy->config.color_format);
+   if (ws)
+      fbdpy->base.screen = native_create_sw_screen(ws);
+   /* the screen must be able to render to the framebuffer's format */
+   if (fbdpy->base.screen) {
+      if (!fbdpy->base.screen->is_format_supported(fbdpy->base.screen,
+               fbdpy->config.color_format, PIPE_TEXTURE_2D, 0,
+               PIPE_BIND_RENDER_TARGET, 0)) {
+         fbdpy->base.screen->destroy(fbdpy->base.screen);
+         fbdpy->base.screen = NULL;
+      }
+   }
+
+   return (fbdpy->base.screen != NULL);
+}
+
+static struct native_display *
+fbdev_display_create(int fd, struct native_event_handler *event_handler)
+{
+   struct fbdev_display *fbdpy;
+
+   fbdpy = CALLOC_STRUCT(fbdev_display);
+   if (!fbdpy)
+      return NULL;
+   /* fbdev_display_init() reads fd, so it is stored first */
+   fbdpy->fd = fd;
+   fbdpy->event_handler = event_handler;
+   /* on failure fd is NOT closed here; native_create_display() does that */
+   if (!fbdev_display_init(&fbdpy->base)) {
+      FREE(fbdpy);
+      return NULL;
+   }
+   /* entry points are installed only after initialization succeeded */
+   fbdpy->base.destroy = fbdev_display_destroy;
+   fbdpy->base.get_param = fbdev_display_get_param;
+   fbdpy->base.get_configs = fbdev_display_get_configs;
+
+   fbdpy->base.modeset = &fbdev_display_modeset;
+
+   return &fbdpy->base;
+}
+
+struct native_probe *
+native_create_probe(void *dpy)
+{
+   return NULL;   /* probing is not implemented for this backend */
+}
+
+enum native_probe_result
+native_get_probe_result(struct native_probe *nprobe)
+{
+   return NATIVE_PROBE_UNKNOWN;   /* nprobe is ignored; no probe is ever made */
+}
+
+const char *
+native_get_name(void)
+{
+   return "FBDEV";   /* name identifying this native backend */
+}
+
+struct native_display *
+native_create_display(void *dpy, struct native_event_handler *event_handler)
+{
+   struct native_display *ndpy;
+   int fd;
+   /* dpy carries a file descriptor in a pointer; NULL means "open /dev/fb0",
+    * which is why a caller can never pass fd 0 explicitly */
+   if (!dpy) {
+      fd = open("/dev/fb0", O_RDWR);
+   }
+   else {
+      fd = dup((int) pointer_to_intptr(dpy));   /* work on a private copy */
+   }
+   if (fd < 0)
+      return NULL;
+
+   ndpy = fbdev_display_create(fd, event_handler);
+   if (!ndpy)
+      close(fd);   /* creation failed: do not leak the descriptor */
+
+   return ndpy;
+}
diff --git a/src/gallium/state_trackers/egl/gdi/native_gdi.c b/src/gallium/state_trackers/egl/gdi/native_gdi.c
index 1791d19..56f190d 100644
--- a/src/gallium/state_trackers/egl/gdi/native_gdi.c
+++ b/src/gallium/state_trackers/egl/gdi/native_gdi.c
@@ -367,7 +367,7 @@
 }
 
 struct native_probe *
-native_create_probe(EGLNativeDisplayType dpy)
+native_create_probe(void *dpy)
 {
    return NULL;
 }
@@ -385,8 +385,7 @@
 }
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      struct native_event_handler *event_handler)
+native_create_display(void *dpy, struct native_event_handler *event_handler)
 {
    struct sw_winsys *winsys;
    struct pipe_screen *screen;
diff --git a/src/gallium/state_trackers/egl/kms/native_kms.c b/src/gallium/state_trackers/egl/kms/native_kms.c
index bfb4a9d..f90b871 100644
--- a/src/gallium/state_trackers/egl/kms/native_kms.c
+++ b/src/gallium/state_trackers/egl/kms/native_kms.c
@@ -779,7 +779,7 @@
 }
 
 struct native_probe *
-native_create_probe(EGLNativeDisplayType dpy)
+native_create_probe(void *dpy)
 {
    return NULL;
 }
@@ -810,8 +810,7 @@
 }
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      struct native_event_handler *event_handler)
+native_create_display(void *dpy, struct native_event_handler *event_handler)
 {
    struct native_display *ndpy = NULL;
    int fd;
diff --git a/src/gallium/state_trackers/egl/x11/native_dri2.c b/src/gallium/state_trackers/egl/x11/native_dri2.c
index 3f802dd..e90c33b 100644
--- a/src/gallium/state_trackers/egl/x11/native_dri2.c
+++ b/src/gallium/state_trackers/egl/x11/native_dri2.c
@@ -741,7 +741,7 @@
 }
 
 struct native_display *
-x11_create_dri2_display(EGLNativeDisplayType dpy,
+x11_create_dri2_display(Display *dpy,
                         struct native_event_handler *event_handler,
                         struct drm_api *api)
 {
diff --git a/src/gallium/state_trackers/egl/x11/native_x11.c b/src/gallium/state_trackers/egl/x11/native_x11.c
index b6d51bb..bfa12b2 100644
--- a/src/gallium/state_trackers/egl/x11/native_x11.c
+++ b/src/gallium/state_trackers/egl/x11/native_x11.c
@@ -46,7 +46,7 @@
 }
 
 struct native_probe *
-native_create_probe(EGLNativeDisplayType dpy)
+native_create_probe(void *dpy)
 {
    struct native_probe *nprobe;
    struct x11_screen *xscr;
@@ -127,8 +127,7 @@
 }
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      struct native_event_handler *event_handler)
+native_create_display(void *dpy, struct native_event_handler *event_handler)
 {
    struct native_display *ndpy = NULL;
    boolean force_sw;
@@ -138,14 +137,14 @@
 
    force_sw = debug_get_bool_option("EGL_SOFTWARE", FALSE);
    if (api && !force_sw) {
-      ndpy = x11_create_dri2_display(dpy, event_handler, api);
+      ndpy = x11_create_dri2_display((Display *) dpy, event_handler, api);
    }
 
    if (!ndpy) {
       EGLint level = (force_sw) ? _EGL_INFO : _EGL_WARNING;
 
       _eglLog(level, "use software fallback");
-      ndpy = x11_create_ximage_display(dpy, event_handler);
+      ndpy = x11_create_ximage_display((Display *) dpy, event_handler);
    }
 
    return ndpy;
diff --git a/src/gallium/state_trackers/egl/x11/native_x11.h b/src/gallium/state_trackers/egl/x11/native_x11.h
index 1678403..f1fea7f 100644
--- a/src/gallium/state_trackers/egl/x11/native_x11.h
+++ b/src/gallium/state_trackers/egl/x11/native_x11.h
@@ -30,11 +30,11 @@
 #include "common/native.h"
 
 struct native_display *
-x11_create_ximage_display(EGLNativeDisplayType dpy,
+x11_create_ximage_display(Display *dpy,
                           struct native_event_handler *event_handler);
 
 struct native_display *
-x11_create_dri2_display(EGLNativeDisplayType dpy,
+x11_create_dri2_display(Display *dpy,
                         struct native_event_handler *event_handler,
                         struct drm_api *api);
 
diff --git a/src/gallium/state_trackers/egl/x11/native_ximage.c b/src/gallium/state_trackers/egl/x11/native_ximage.c
index 45679fc..ee10a04 100644
--- a/src/gallium/state_trackers/egl/x11/native_ximage.c
+++ b/src/gallium/state_trackers/egl/x11/native_ximage.c
@@ -441,7 +441,7 @@
 }
 
 struct native_display *
-x11_create_ximage_display(EGLNativeDisplayType dpy,
+x11_create_ximage_display(Display *dpy,
                           struct native_event_handler *event_handler)
 {
    struct ximage_display *xdpy;
diff --git a/src/gallium/targets/Makefile.egl b/src/gallium/targets/Makefile.egl
index 4fa13e8..3158560 100644
--- a/src/gallium/targets/Makefile.egl
+++ b/src/gallium/targets/Makefile.egl
@@ -24,15 +24,26 @@
 
 x11_LIBS = $(common_LIBS) -lX11 -lXext -lXfixes
 
+kms_ST = $(TOP)/src/gallium/state_trackers/egl/libeglkms.a
+kms_LIBS = $(common_LIBS)
+
+fbdev_ST = \
+	$(TOP)/src/gallium/state_trackers/egl/libeglfbdev.a \
+	$(TOP)/src/gallium/winsys/sw/fbdev/libfbdev.a \
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/identity/libidentity.a \
+	$(TOP)/src/gallium/drivers/trace/libtrace.a \
+	$(TOP)/src/gallium/drivers/rbug/librbug.a
+fbdev_LIBS = $(common_LIBS)
+
 ifeq ($(MESA_LLVM),1)
 x11_ST += $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
 x11_LIBS += $(LLVM_LIBS)
+fbdev_ST += $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
+fbdev_LIBS += $(LLVM_LIBS)
 LDFLAGS += $(LLVM_LDFLAGS)
 endif
 
-kms_ST = $(TOP)/src/gallium/state_trackers/egl/libeglkms.a
-kms_LIBS = $(common_LIBS)
-
 ### Include directories
 INCLUDES = \
 	-I$(TOP)/include \
@@ -51,13 +62,19 @@
 
 ##### TARGETS #####
 
-EGL_DISPLAY_DRIVERS = $(foreach dpy, $(EGL_DISPLAYS), egl_$(dpy)_$(EGL_DRIVER_NAME).so)
+ifeq ($(EGL_DRIVER_NAME),swrast)
+EGL_PLATFORMS := $(filter-out kms, $(EGL_PLATFORMS))
+else
+EGL_PLATFORMS := $(filter-out fbdev, $(EGL_PLATFORMS))
+endif
 
-EGL_DISPLAY_LIBS = $(foreach drv, $(EGL_DISPLAY_DRIVERS), $(TOP)/$(LIB_DIR)/egl/$(drv))
+EGL_PLATFORM_DRIVERS = $(foreach plat, $(EGL_PLATFORMS), egl_$(plat)_$(EGL_DRIVER_NAME).so)
 
-default: $(EGL_DISPLAY_LIBS)
+EGL_PLATFORM_LIBS = $(foreach drv, $(EGL_PLATFORM_DRIVERS), $(TOP)/$(LIB_DIR)/egl/$(drv))
 
-$(EGL_DISPLAY_LIBS): $(TOP)/$(LIB_DIR)/egl/%.so: %.so
+default: $(EGL_PLATFORM_LIBS)
+
+$(EGL_PLATFORM_LIBS): $(TOP)/$(LIB_DIR)/egl/%.so: %.so
 	@$(INSTALL) -d $(TOP)/$(LIB_DIR)/egl
 	$(INSTALL) $< $(TOP)/$(LIB_DIR)/egl
 
@@ -75,13 +92,16 @@
 egl_kms_$(EGL_DRIVER_NAME).so: $(EGL_DRIVER_OBJECTS) $(kms_ST) $(EGL_DRIVER_PIPES) $(GALLIUM_AUXILIARIES) Makefile
 	$(call mklib-egl,kms)
 
+egl_fbdev_$(EGL_DRIVER_NAME).so: $(EGL_DRIVER_OBJECTS) $(fbdev_ST) $(EGL_DRIVER_PIPES) $(GALLIUM_AUXILIARIES) Makefile
+	$(call mklib-egl,fbdev)
+
 clean:
 	-rm -f $(EGL_DRIVER_OBJECTS)
-	-rm -f $(EGL_DISPLAY_DRIVERS)
+	-rm -f $(EGL_PLATFORM_DRIVERS)
 
-install: $(EGL_DISPLAY_LIBS)
+install: $(EGL_PLATFORM_LIBS)
 	$(INSTALL) -d $(DESTDIR)$(EGL_DRIVER_INSTALL_DIR)
-	for lib in $(EGL_DISPLAY_LIBS); do \
+	for lib in $(EGL_PLATFORM_LIBS); do \
 		$(MINSTALL) -m 755 "$$lib" $(DESTDIR)$(EGL_DRIVER_INSTALL_DIR); \
 	done
 
diff --git a/src/gallium/targets/dri-radeong/Makefile b/src/gallium/targets/dri-radeong/Makefile
index 8ef24c0..8ba1972 100644
--- a/src/gallium/targets/dri-radeong/Makefile
+++ b/src/gallium/targets/dri-radeong/Makefile
@@ -7,6 +7,7 @@
 	$(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \
 	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
 	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/galahad/libgalahad.a \
 	$(TOP)/src/gallium/drivers/trace/libtrace.a \
 	$(TOP)/src/gallium/drivers/rbug/librbug.a \
 	$(TOP)/src/gallium/drivers/r300/libr300.a
diff --git a/src/gallium/targets/egl-radeon/Makefile b/src/gallium/targets/egl-radeon/Makefile
index 8fcca26..64c20af 100644
--- a/src/gallium/targets/egl-radeon/Makefile
+++ b/src/gallium/targets/egl-radeon/Makefile
@@ -7,6 +7,7 @@
 
 EGL_DRIVER_PIPES = \
 	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+	$(TOP)/src/gallium/drivers/galahad/libgalahad.a \
 	$(TOP)/src/gallium/drivers/trace/libtrace.a \
 	$(TOP)/src/gallium/drivers/rbug/librbug.a \
 	$(TOP)/src/gallium/drivers/r300/libr300.a
diff --git a/src/gallium/targets/xorg-radeon/Makefile b/src/gallium/targets/xorg-radeon/Makefile
index a4951c4..6cbc61e 100644
--- a/src/gallium/targets/xorg-radeon/Makefile
+++ b/src/gallium/targets/xorg-radeon/Makefile
@@ -13,10 +13,11 @@
         $(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
         $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
         $(TOP)/src/gallium/drivers/r300/libr300.a \
+		$(TOP)/src/gallium/drivers/galahad/libgalahad.a \
         $(TOP)/src/gallium/drivers/trace/libtrace.a \
         $(TOP)/src/gallium/drivers/rbug/librbug.a \
         $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
         $(GALLIUM_AUXILIARIES) \
-	$(shell pkg-config --libs libdrm libdrm_intel)
+	$(shell pkg-config --libs libdrm libdrm_radeon)
 
 include ../Makefile.xorg
diff --git a/src/gallium/tests/graw/fs-test.c b/src/gallium/tests/graw/fs-test.c
index 3389efb..dea0873 100644
--- a/src/gallium/tests/graw/fs-test.c
+++ b/src/gallium/tests/graw/fs-test.c
@@ -42,7 +42,8 @@
 static struct pipe_screen *screen = NULL;
 static struct pipe_context *ctx = NULL;
 static struct pipe_resource *rttex = NULL;
-static struct pipe_resource *constbuf = NULL;
+static struct pipe_resource *constbuf1 = NULL;
+static struct pipe_resource *constbuf2 = NULL;
 static struct pipe_surface *surf = NULL;
 static struct pipe_sampler_view *sv = NULL;
 static void *sampler = NULL;
@@ -73,7 +74,7 @@
      { -1, 0, 0, 1 } },
 };
 
-static float constants[] = 
+static float constants1[] = 
 {  0.4, 0, 0,  1,
    1,   1, 1,  1,
    2,   2, 2,  2,
@@ -81,7 +82,25 @@
 
    3,  0, 0, 0,
    0, .5, 0, 0,
-   0,  0, 1, 0,
+   1,  0, 0, 1,
+   0,  0, 0, 1,
+
+   1, 0, 0, 0.5,
+   0, 1, 0, 0.5,
+   0, 0, 1, 0,
+   0, 0, 0, 1,
+};
+
+
+static float constants2[] = 
+{  1, 0, 0,  1,
+   0, 1, 0,  1,
+   0, 0, 1,  1,
+   0, 0, 0,  0,
+
+   1,  1, 0, 1,
+   1, .5, 0, 1,
+   1,  0, 0, 1,
    0,  0, 0, 1,
 
    1, 0, 0, 0.5,
@@ -97,34 +116,58 @@
 
    templat.target = PIPE_BUFFER;
    templat.format = PIPE_FORMAT_R8_UNORM;
-   templat.width0 = sizeof(constants);
+   templat.width0 = sizeof(constants1);
    templat.height0 = 1;
    templat.depth0 = 1;
    templat.last_level = 0;
    templat.nr_samples = 1;
    templat.bind = PIPE_BIND_CONSTANT_BUFFER;
 
-   constbuf = screen->resource_create(screen,
-                                      &templat);
-   if (constbuf == NULL)
+   constbuf1 = screen->resource_create(screen,
+                                       &templat);
+   if (constbuf1 == NULL)
+      exit(4);
+
+   constbuf2 = screen->resource_create(screen,
+                                       &templat);
+   if (constbuf2 == NULL)
       exit(4);
 
 
-   u_box_2d(0,0,sizeof(constants),1, &box);
+   {
+      u_box_2d(0,0,sizeof(constants1),1, &box);
 
-   ctx->transfer_inline_write(ctx,
-                              constbuf,
-                              u_subresource(0,0),
-                              PIPE_TRANSFER_WRITE,
-                              &box,
-                              constants,
-                              sizeof constants,
-                              sizeof constants);
+      ctx->transfer_inline_write(ctx,
+                                 constbuf1,
+                                 u_subresource(0,0),
+                                 PIPE_TRANSFER_WRITE,
+                                 &box,
+                                 constants1,
+                                 sizeof constants1,
+                                 sizeof constants1);
 
 
-   ctx->set_constant_buffer(ctx,
-                            PIPE_SHADER_FRAGMENT, 0,
-                            constbuf);
+      ctx->set_constant_buffer(ctx,
+                               PIPE_SHADER_FRAGMENT, 0,
+                               constbuf1);
+   }
+   {
+      u_box_2d(0,0,sizeof(constants2),1, &box);
+
+      ctx->transfer_inline_write(ctx,
+                                 constbuf2,
+                                 u_subresource(0,0),
+                                 PIPE_TRANSFER_WRITE,
+                                 &box,
+                                 constants2,
+                                 sizeof constants2,
+                                 sizeof constants2);
+
+
+      ctx->set_constant_buffer(ctx,
+                               PIPE_SHADER_FRAGMENT, 1,
+                               constbuf2);
+   }
 }
 
 
diff --git a/src/gallium/tests/graw/geometry-shader/mov-cb-2d.txt b/src/gallium/tests/graw/geometry-shader/mov-cb-2d.txt
new file mode 100644
index 0000000..058acfb
--- /dev/null
+++ b/src/gallium/tests/graw/geometry-shader/mov-cb-2d.txt
@@ -0,0 +1,24 @@
+GEOM
+PROPERTY GS_INPUT_PRIMITIVE TRIANGLES
+PROPERTY GS_OUTPUT_PRIMITIVE TRIANGLE_STRIP
+DCL IN[][0], POSITION, CONSTANT
+DCL IN[][1], COLOR, CONSTANT
+DCL OUT[0], POSITION, CONSTANT
+DCL OUT[1], COLOR, CONSTANT
+DCL CONST[1][0..6]
+
+MOV OUT[0], IN[0][0]
+MOV OUT[1], CONST[1][0]
+EMIT
+
+MOV OUT[0], IN[1][0]
+MOV OUT[1], CONST[1][1]
+EMIT
+
+MOV OUT[0], IN[2][0]
+MOV OUT[1], CONST[1][4]
+EMIT
+
+ENDPRIM
+
+END
diff --git a/src/gallium/tests/graw/geometry-shader/mov.txt b/src/gallium/tests/graw/geometry-shader/mov.txt
index c37051d..97150a5 100644
--- a/src/gallium/tests/graw/geometry-shader/mov.txt
+++ b/src/gallium/tests/graw/geometry-shader/mov.txt
@@ -11,7 +11,7 @@
 EMIT
 
 MOV OUT[0], IN[1][0]
-MOV OUT[1], IN[0][1]
+MOV OUT[1], IN[1][1]
 EMIT
 
 MOV OUT[0], IN[2][0]
diff --git a/src/gallium/tests/graw/gs-test.c b/src/gallium/tests/graw/gs-test.c
index e8c82ba..3087d44 100644
--- a/src/gallium/tests/graw/gs-test.c
+++ b/src/gallium/tests/graw/gs-test.c
@@ -44,7 +44,8 @@
 static struct pipe_screen *screen = NULL;
 static struct pipe_context *ctx = NULL;
 static struct pipe_resource *rttex = NULL;
-static struct pipe_resource *constbuf = NULL;
+static struct pipe_resource *constbuf1 = NULL;
+static struct pipe_resource *constbuf2 = NULL;
 static struct pipe_surface *surf = NULL;
 static struct pipe_sampler_view *sv = NULL;
 static void *sampler = NULL;
@@ -55,6 +56,7 @@
    float position[4];
    float color[4];
    float texcoord[4];
+   float generic[4];
 };
 
 /* Vertex data matches progs/fp/fp-tri.c, but flipped in Y dimension
@@ -64,37 +66,51 @@
 {
    { { 0.9, 0.9, 0.0, 1.0 },
      { 0, 0, 1, 1 },
-     { 1, 1, 0, 1 } },
+     { 1, 1, 0, 1 },
+     { 1, 0, 1, 0 }
+   },
 
    { { 0.9,  -0.9, 0.0, 1.0 },
      { 1, 0, 0, 1 },
-     { 1, -1, 0, 1 } },
+     { 1, -1, 0, 1 },
+     { 0, 1, 0, 1 }
+   },
 
    { {-0.9,  0.0, 0.0, 1.0 },
      { 0, 1, 0, 1 },
-     { -1, 0, 0, 1 } },
+     { -1, 0, 0, 1 },
+     { 0, 0, 1, 1 }
+   },
 };
 
 static struct vertex vertices_strip[] =
 {
    { { 0.9, 0.9, 0.0, 1.0 },
      { 0, 0, 1, 1 },
-     { 1, 1, 0, 1 } },
+     { 1, 1, 0, 1 },
+     { 1, 0, 0, 1 }
+   },
 
    { { 0.9,  -0.9, 0.0, 1.0 },
      { 1, 0, 0, 1 },
-     { 1, -1, 0, 1 } },
-
-   { {-0.9,  -0.9, 0.0, 1.0 },
-     { 0, 1, 0, 1 },
-     { -1, -1, 0, 1 } },
+     { 1, -1, 0, 1 },
+     { 0, 1, 0, 1 }
+   },
 
    { {-0.9,  0.9, 0.0, 1.0 },
+     { 0, 1, 0, 1 },
+     { -1, 1, 0, 1 },
+     { 0, 0, 1, 1 }
+   },
+
+   { {-0.9,  -0.9, 0.0, 1.0 },
      { 1, 1, 0, 1 },
-     { -1, 1, 0, 1 } },
+     { -1, -1, 0, 1 },
+     { 1, 1, 0, 1 }
+   },
 };
 
-static float constants[] = 
+static float constants1[] =
 {  0.4, 0, 0,  1,
    1,   1, 1,  1,
    2,   2, 2,  2,
@@ -111,6 +127,25 @@
    0, 0, 0, 1,
 };
 
+
+static float constants2[] =
+{  1, 0, 0,  1,
+   0, 1, 0,  1,
+   0, 0, 1,  1,
+   0, 0, 0,  1,
+
+   1,  1, 0, 1,
+   1, .5, 0, 1,
+   0,  1, 1, 1,
+   1,  0, 1, 1,
+
+   1, 0, 0, 0.5,
+   0, 1, 0, 0.5,
+   0, 0, 1, 0,
+   0, 0, 0, 1,
+};
+
+
 static void init_fs_constbuf( void )
 {
    struct pipe_resource templat;
@@ -118,34 +153,54 @@
 
    templat.target = PIPE_BUFFER;
    templat.format = PIPE_FORMAT_R8_UNORM;
-   templat.width0 = sizeof(constants);
+   templat.width0 = sizeof(constants1);
    templat.height0 = 1;
    templat.depth0 = 1;
    templat.last_level = 0;
    templat.nr_samples = 1;
    templat.bind = PIPE_BIND_CONSTANT_BUFFER;
 
-   constbuf = screen->resource_create(screen,
-                                      &templat);
-   if (constbuf == NULL)
+   constbuf1 = screen->resource_create(screen, &templat);
+   if (constbuf1 == NULL)
+      exit(4);
+   constbuf2 = screen->resource_create(screen, &templat);
+   if (constbuf2 == NULL)
       exit(4);
 
+   {
+      u_box_2d(0,0,sizeof(constants1),1, &box);
 
-   u_box_2d(0,0,sizeof(constants),1, &box);
-
-   ctx->transfer_inline_write(ctx,
-                              constbuf,
-                              u_subresource(0,0),
-                              PIPE_TRANSFER_WRITE,
-                              &box,
-                              constants,
-                              sizeof constants,
-                              sizeof constants);
+      ctx->transfer_inline_write(ctx,
+                                 constbuf1,
+                                 u_subresource(0,0),
+                                 PIPE_TRANSFER_WRITE,
+                                 &box,
+                                 constants1,
+                                 sizeof constants1,
+                                 sizeof constants1);
 
 
-   ctx->set_constant_buffer(ctx,
-                            PIPE_SHADER_FRAGMENT, 0,
-                            constbuf);
+      ctx->set_constant_buffer(ctx,
+                               PIPE_SHADER_GEOMETRY, 0,
+                               constbuf1);
+   }
+   {
+      u_box_2d(0,0,sizeof(constants2),1, &box);
+
+      ctx->transfer_inline_write(ctx,
+                                 constbuf2,
+                                 u_subresource(0,0),
+                                 PIPE_TRANSFER_WRITE,
+                                 &box,
+                                 constants2,
+                                 sizeof constants2,
+                                 sizeof constants2);
+
+
+      ctx->set_constant_buffer(ctx,
+                               PIPE_SHADER_GEOMETRY, 1,
+                               constbuf2);
+   }
 }
 
 
@@ -174,7 +229,7 @@
 
 static void set_vertices( void )
 {
-   struct pipe_vertex_element ve[3];
+   struct pipe_vertex_element ve[4];
    struct pipe_vertex_buffer vbuf;
    void *handle;
 
@@ -186,11 +241,12 @@
    ve[1].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
    ve[2].src_offset = Offset(struct vertex, texcoord);
    ve[2].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+   ve[3].src_offset = Offset(struct vertex, generic);
+   ve[3].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 
-   handle = ctx->create_vertex_elements_state(ctx, 3, ve);
+   handle = ctx->create_vertex_elements_state(ctx, 4, ve);
    ctx->bind_vertex_elements_state(ctx, handle);
 
-
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
    if (draw_strip) {
@@ -218,12 +274,15 @@
       "DCL IN[0]\n"
       "DCL IN[1]\n"
       "DCL IN[2]\n"
+      "DCL IN[3]\n"
       "DCL OUT[0], POSITION\n"
       "DCL OUT[1], COLOR[0]\n"
       "DCL OUT[2], GENERIC[0]\n"
+      "DCL OUT[3], GENERIC[1]\n"
       "  MOV OUT[0], IN[0]\n"
       "  MOV OUT[1], IN[1]\n"
       "  MOV OUT[2], IN[2]\n"
+      "  MOV OUT[3], IN[3]\n"
       "  END\n";
 
    handle = graw_parse_vertex_shader(ctx, text);
diff --git a/src/gallium/tests/python/tests/regress/fragment-shader/frag-cb-2d.sh b/src/gallium/tests/python/tests/regress/fragment-shader/frag-cb-2d.sh
index f70a514..bbc3a10 100644
--- a/src/gallium/tests/python/tests/regress/fragment-shader/frag-cb-2d.sh
+++ b/src/gallium/tests/python/tests/regress/fragment-shader/frag-cb-2d.sh
@@ -2,8 +2,8 @@
 
 DCL IN[0], COLOR, LINEAR
 DCL OUT[0], COLOR
-DCL CONST[1][1..2]
+DCL CONST[1][6]
 
-MAD OUT[0], IN[0], CONST[1][2], CONST[1][1]
+MOV OUT[0], CONST[1][6]
 
 END
diff --git a/src/gallium/tests/python/tests/regress/fragment-shader/frag-mad-immx.sh b/src/gallium/tests/python/tests/regress/fragment-shader/frag-mad-immx.sh
new file mode 100644
index 0000000..6b03491
--- /dev/null
+++ b/src/gallium/tests/python/tests/regress/fragment-shader/frag-mad-immx.sh
@@ -0,0 +1,10 @@
+FRAG
+
+DCL IN[0], COLOR, LINEAR
+DCL OUT[0], COLOR
+DCL IMMX[0..1]  {{ 0.5, 0.4, 0.6, 1.0 },
+                 { 0.5, 0.4, 0.6, 0.0 }}
+
+MAD OUT[0], IN[0], IMMX[0], IMMX[1]
+
+END
diff --git a/src/gallium/tests/python/tests/regress/fragment-shader/frag-tempx.sh b/src/gallium/tests/python/tests/regress/fragment-shader/frag-tempx.sh
new file mode 100644
index 0000000..81bcad2
--- /dev/null
+++ b/src/gallium/tests/python/tests/regress/fragment-shader/frag-tempx.sh
@@ -0,0 +1,14 @@
+FRAG
+
+DCL IN[0], COLOR, LINEAR
+DCL OUT[0], COLOR
+
+DCL TEMPX[0][0..1]
+
+IMM FLT32 { -0.5, -0.4, -0.6, 0.0 }
+
+ADD TEMPX[0][0], IN[0], IMM[0]
+ADD TEMPX[0][1], IN[0], IMM[0]
+ABS OUT[0], TEMPX[0][1]
+
+END
diff --git a/src/gallium/winsys/r600/drm/r600_drm.c b/src/gallium/winsys/r600/drm/r600_drm.c
index b772ff0..803049d 100644
--- a/src/gallium/winsys/r600/drm/r600_drm.c
+++ b/src/gallium/winsys/r600/drm/r600_drm.c
@@ -48,19 +48,22 @@
 			       struct winsys_handle *whandle)
 {
 	struct drm_gem_flink flink;
-	struct r600_buffer* rbuffer;
-	int r;
+	struct r600_buffer* rbuffer = (struct r600_buffer*)buf;
 
-	rbuffer = (struct r600_buffer*)buf;
-	if (!rbuffer->flink) {
-		flink.handle = rbuffer->bo->handle;
-		r = ioctl(rw->fd, DRM_IOCTL_GEM_FLINK, &flink);
-		if (r) {
-			return FALSE;
+	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+		if (!rbuffer->flink) {
+			flink.handle = rbuffer->bo->handle;
+
+			if (ioctl(rw->fd, DRM_IOCTL_GEM_FLINK, &flink)) {
+				return FALSE;
+			}
+
+			rbuffer->flink = flink.name;
 		}
-		rbuffer->flink = flink.name;
+		whandle->handle = rbuffer->flink;
+	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+		whandle->handle = rbuffer->bo->handle;
 	}
-	whandle->handle = rbuffer->flink;
 	return TRUE;
 }
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm.c b/src/gallium/winsys/radeon/drm/radeon_drm.c
index 59f1b10..a9ae09c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm.c
@@ -34,6 +34,8 @@
 #include "radeon_buffer.h"
 
 #include "r300_winsys.h"
+
+#include "galahad/glhd_drm.h"
 #include "trace/tr_drm.h"
 
 #include "util/u_memory.h"
@@ -188,5 +190,5 @@
 
 struct drm_api* drm_api_create()
 {
-    return trace_drm_create(&radeon_drm_api_hooks);
+    return galahad_drm_create(trace_drm_create(&radeon_drm_api_hooks));
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_buffer.c b/src/gallium/winsys/radeon/drm/radeon_drm_buffer.c
index ee1b9ed..a4b6cff 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_buffer.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_buffer.c
@@ -267,16 +267,14 @@
 boolean radeon_drm_bufmgr_get_handle(struct pb_buffer *_buf,
 				     struct winsys_handle *whandle)
 {
-    int retval, fd;
     struct drm_gem_flink flink;
     struct radeon_drm_buffer *buf = get_drm_buffer(_buf);
+
     if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
 	if (!buf->flinked) {
-	    fd = buf->mgr->rws->fd;
 	    flink.handle = buf->bo->handle;
 
-	    retval = ioctl(fd, DRM_IOCTL_GEM_FLINK, &flink);
-	    if (retval) {
+            if (ioctl(buf->mgr->rws->fd, DRM_IOCTL_GEM_FLINK, &flink)) {
 		return FALSE;
 	    }
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_r300.c b/src/gallium/winsys/radeon/drm/radeon_r300.c
index 70ae01a..d2d317d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_r300.c
+++ b/src/gallium/winsys/radeon/drm/radeon_r300.c
@@ -136,29 +136,21 @@
 }
 
 static struct r300_winsys_buffer *radeon_r300_winsys_buffer_from_handle(struct r300_winsys_screen *rws,
-									struct pipe_screen *screen,
-									struct winsys_handle *whandle,
-									unsigned *stride)
+                                                                        unsigned handle)
 {
     struct radeon_libdrm_winsys *ws = radeon_winsys_screen(rws);
     struct pb_buffer *_buf;
 
-    _buf = radeon_drm_bufmgr_create_buffer_from_handle(ws->kman, whandle->handle);
-    *stride = whandle->stride;
+    _buf = radeon_drm_bufmgr_create_buffer_from_handle(ws->kman, handle);
     return radeon_libdrm_winsys_buffer(_buf);
 }
 
 static boolean radeon_r300_winsys_buffer_get_handle(struct r300_winsys_screen *rws,
 						    struct r300_winsys_buffer *buffer,
-						    unsigned stride,
-						    struct winsys_handle *whandle)
+                                                    struct winsys_handle *whandle)
 {
     struct pb_buffer *_buf = radeon_pb_buffer(buffer);
-    boolean ret;
-    ret = radeon_drm_bufmgr_get_handle(_buf, whandle);
-    if (ret)
-	whandle->stride = stride;
-    return ret;
+    return radeon_drm_bufmgr_get_handle(_buf, whandle);
 }
 
 static void radeon_set_flush_cb(struct r300_winsys_screen *rws,
@@ -192,25 +184,23 @@
     return TRUE;
 }
 
-static void radeon_get_cs_info(struct r300_winsys_screen *rws,
-                               struct r300_cs_info *info)
+static unsigned radeon_get_cs_free_dwords(struct r300_winsys_screen *rws)
 {
     struct radeon_libdrm_winsys *ws = radeon_winsys_screen(rws);
     struct radeon_cs *cs = ws->cs;
 
-    info->capacity = cs->ndw;
-    info->used = cs->cdw;
-    info->free = cs->ndw - cs->cdw;
+    return cs->ndw - cs->cdw;
 }
 
-static void radeon_begin_cs(struct r300_winsys_screen *rws,
-                            int size,
-                            const char* file,
-                            const char* function,
-                            int line)
+static uint32_t *radeon_get_cs_pointer(struct r300_winsys_screen *rws,
+                                       unsigned count)
 {
     struct radeon_libdrm_winsys *ws = radeon_winsys_screen(rws);
-    radeon_cs_begin(ws->cs, size, file, function, line);
+    struct radeon_cs *cs = ws->cs;
+    uint32_t *ptr = cs->packets + cs->cdw;
+
+    cs->cdw += count;
+    return ptr;
 }
 
 static void radeon_write_cs_dword(struct r300_winsys_screen *rws,
@@ -243,15 +233,6 @@
     radeon_cs_space_reset_bos(ws->cs);
 }
 
-static void radeon_end_cs(struct r300_winsys_screen *rws,
-                          const char* file,
-                          const char* function,
-                          int line)
-{
-    struct radeon_libdrm_winsys *ws = radeon_winsys_screen(rws);
-    radeon_cs_end(ws->cs, file, function, line);
-}
-
 static void radeon_flush_cs(struct r300_winsys_screen *rws)
 {
     struct radeon_libdrm_winsys *ws = radeon_winsys_screen(rws);
@@ -345,12 +326,11 @@
     ws->base.add_buffer = radeon_add_buffer;
     ws->base.validate = radeon_validate;
     ws->base.destroy = radeon_winsys_destroy;
-    ws->base.get_cs_info = radeon_get_cs_info;
-    ws->base.begin_cs = radeon_begin_cs;
+    ws->base.get_cs_free_dwords = radeon_get_cs_free_dwords;
+    ws->base.get_cs_pointer = radeon_get_cs_pointer;
     ws->base.write_cs_dword = radeon_write_cs_dword;
     ws->base.write_cs_table = radeon_write_cs_table;
     ws->base.write_cs_reloc = radeon_write_cs_reloc;
-    ws->base.end_cs = radeon_end_cs;
     ws->base.flush_cs = radeon_flush_cs;
     ws->base.reset_bos = radeon_reset_bos;
     ws->base.set_flush_cb = radeon_set_flush_cb;
diff --git a/src/gallium/winsys/sw/fbdev/Makefile b/src/gallium/winsys/sw/fbdev/Makefile
new file mode 100644
index 0000000..8832aab
--- /dev/null
+++ b/src/gallium/winsys/sw/fbdev/Makefile
@@ -0,0 +1,13 @@
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = fbdev
+
+LIBRARY_INCLUDES =
+
+LIBRARY_DEFINES =
+
+C_SOURCES = \
+	fbdev_sw_winsys.c
+
+include ../../../Makefile.template
diff --git a/src/gallium/winsys/sw/fbdev/SConscript b/src/gallium/winsys/sw/fbdev/SConscript
new file mode 100644
index 0000000..3b5b4ff
--- /dev/null
+++ b/src/gallium/winsys/sw/fbdev/SConscript
@@ -0,0 +1,23 @@
+#######################################################################
+# SConscript for fbdev winsys
+
+
+Import('*')
+
+if env['platform'] == 'linux':
+
+    env = env.Clone()
+
+    env.Append(CPPPATH = [
+        '#/src/gallium/include',
+        '#/src/gallium/auxiliary',
+        '#/src/gallium/drivers',
+    ])
+
+    ws_fbdev = env.ConvenienceLibrary(
+        target = 'ws_fbdev',
+        source = [
+           'fbdev_sw_winsys.c',
+        ]
+    )
+    Export('ws_fbdev')
diff --git a/src/gallium/winsys/sw/fbdev/fbdev_sw_winsys.c b/src/gallium/winsys/sw/fbdev/fbdev_sw_winsys.c
new file mode 100644
index 0000000..f4f4cd7
--- /dev/null
+++ b/src/gallium/winsys/sw/fbdev/fbdev_sw_winsys.c
@@ -0,0 +1,224 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <linux/fb.h>
+
+#include "pipe/p_compiler.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "state_tracker/sw_winsys.h"
+
+#include "fbdev_sw_winsys.h"
+
+struct fbdev_sw_displaytarget
+{
+   enum pipe_format format; /* format given to displaytarget_create */
+   unsigned width;          /* as passed to displaytarget_create */
+   unsigned height;         /* as passed to displaytarget_create */
+   unsigned stride;         /* format row stride rounded up to the alignment */
+
+   void *data;              /* align_malloc'ed pixel storage */
+   void *mapped;            /* set to data by map, reset to NULL by unmap */
+};
+
+struct fbdev_sw_winsys
+{
+   struct sw_winsys base;   /* first member: fbdev_sw_winsys() casts from it */
+
+   int fd;                  /* framebuffer device fd supplied by the caller */
+   enum pipe_format format; /* the only display target format accepted */
+
+   struct fb_fix_screeninfo finfo; /* filled in by FBIOGET_FSCREENINFO */
+   void *fbmem;             /* mmap of the device, finfo.smem_len bytes */
+   unsigned rows;           /* finfo.smem_len / finfo.line_length */
+   unsigned stride;         /* finfo.line_length */
+};
+
+static INLINE struct fbdev_sw_displaytarget *
+fbdev_sw_displaytarget(struct sw_displaytarget *dt)
+{
+   return (struct fbdev_sw_displaytarget *) dt;
+}
+
+static INLINE struct fbdev_sw_winsys *
+fbdev_sw_winsys(struct sw_winsys *ws)
+{
+   return (struct fbdev_sw_winsys *) ws;
+}
+
+/* Copy the display target into the mapped framebuffer row by row, clipped
+ * to the smaller of the two surfaces in both rows and bytes per row. */
+static void
+fbdev_displaytarget_display(struct sw_winsys *ws,
+                            struct sw_displaytarget *dt,
+                            void *context_private)
+{
+   struct fbdev_sw_winsys *fbdev = fbdev_sw_winsys(ws);
+   struct fbdev_sw_displaytarget *fbdt = fbdev_sw_displaytarget(dt);
+   /* byte pointers: arithmetic on void * is a GNU extension, not ISO C */
+   char *dst = fbdev->fbmem;
+   const char *src = fbdt->data;
+   unsigned rows = MIN2(fbdt->height, fbdev->rows);
+   unsigned len = util_format_get_stride(fbdt->format, fbdt->width);
+   unsigned i;
+
+   len = MIN2(len, fbdev->stride);
+   for (i = 0; i < rows; i++)
+      memcpy(dst + fbdev->stride * i, src + fbdt->stride * i, len);
+}
+
+/* Forget the pointer handed out by fbdev_displaytarget_map(); the pixel
+ * storage itself is not touched. */
+static void
+fbdev_displaytarget_unmap(struct sw_winsys *ws, struct sw_displaytarget *dt)
+{
+   fbdev_sw_displaytarget(dt)->mapped = NULL;
+}
+
+/* Hand out the CPU-side pixel storage and record it as the mapping.
+ * Neither ws nor flags is consulted. */
+static void *
+fbdev_displaytarget_map(struct sw_winsys *ws, struct sw_displaytarget *dt,
+                        unsigned flags)
+{
+   struct fbdev_sw_displaytarget *target = fbdev_sw_displaytarget(dt);
+   return target->mapped = target->data;
+}
+
+/* Free the pixel storage, if any was allocated, then the target itself. */
+static void
+fbdev_displaytarget_destroy(struct sw_winsys *ws, struct sw_displaytarget *dt)
+{
+   struct fbdev_sw_displaytarget *target = fbdev_sw_displaytarget(dt);
+   void *pixels = target->data;
+
+   if (pixels != NULL)
+      align_free(pixels);
+   FREE(target);
+}
+
+/* Allocate a CPU-side display target in the winsys' format.  The row
+ * stride is the format's natural stride rounded up to alignment and is
+ * reported through *stride.  Returns NULL if format is not the winsys'
+ * format or if either allocation fails. */
+static struct sw_displaytarget *
+fbdev_displaytarget_create(struct sw_winsys *ws,
+                           unsigned tex_usage,
+                           enum pipe_format format,
+                           unsigned width, unsigned height,
+                           unsigned alignment,
+                           unsigned *stride)
+{
+   struct fbdev_sw_displaytarget *target;
+   unsigned row_bytes, block_rows;
+
+   if (format != fbdev_sw_winsys(ws)->format)
+      return NULL;
+
+   target = CALLOC_STRUCT(fbdev_sw_displaytarget);
+   if (target == NULL)
+      return NULL;
+
+   row_bytes = align(util_format_get_stride(format, width), alignment);
+   block_rows = util_format_get_nblocksy(format, height);
+
+   target->data = align_malloc(row_bytes * block_rows, alignment);
+   if (target->data == NULL) {
+      FREE(target);
+      return NULL;
+   }
+
+   target->format = format;
+   target->width = width;
+   target->height = height;
+   target->stride = row_bytes;
+   *stride = row_bytes;
+
+   return (struct sw_displaytarget *) target;
+}
+
+/* True only for the one format this winsys was created with. */
+static boolean
+fbdev_is_displaytarget_format_supported(struct sw_winsys *ws,
+                                        unsigned tex_usage,
+                                        enum pipe_format format)
+{
+   return format == fbdev_sw_winsys(ws)->format;
+}
+
+/* Unmap the framebuffer and free the winsys; the fd is not closed here. */
+static void
+fbdev_destroy(struct sw_winsys *ws)
+{
+   struct fbdev_sw_winsys *self = fbdev_sw_winsys(ws);
+   munmap(self->fbmem, self->finfo.smem_len);
+   FREE(self);
+}
+
+/* Create a winsys for the fbdev device open on fd; returns NULL on any
+ * failure, including a zero line_length (guards the division below). */
+struct sw_winsys *
+fbdev_create_sw_winsys(int fd, enum pipe_format format)
+{
+   struct fbdev_sw_winsys *fbdev = CALLOC_STRUCT(fbdev_sw_winsys);
+
+   if (!fbdev)
+      return NULL;
+
+   fbdev->fd = fd;
+   fbdev->format = format;
+   if (ioctl(fbdev->fd, FBIOGET_FSCREENINFO, &fbdev->finfo) ||
+       !fbdev->finfo.line_length) {
+      FREE(fbdev);
+      return NULL;
+   }
+
+   fbdev->fbmem = mmap(0, fbdev->finfo.smem_len,
+         PROT_WRITE, MAP_SHARED, fbdev->fd, 0);
+   if (fbdev->fbmem == MAP_FAILED) {
+      FREE(fbdev);
+      return NULL;
+   }
+
+   fbdev->stride = fbdev->finfo.line_length;
+   fbdev->rows = fbdev->finfo.smem_len / fbdev->stride;
+
+   fbdev->base.destroy = fbdev_destroy;
+   fbdev->base.is_displaytarget_format_supported =
+      fbdev_is_displaytarget_format_supported;
+   fbdev->base.displaytarget_create = fbdev_displaytarget_create;
+   fbdev->base.displaytarget_destroy = fbdev_displaytarget_destroy;
+   fbdev->base.displaytarget_map = fbdev_displaytarget_map;
+   fbdev->base.displaytarget_unmap = fbdev_displaytarget_unmap;
+   fbdev->base.displaytarget_display = fbdev_displaytarget_display;
+
+   return &fbdev->base;
+}
diff --git a/src/gallium/winsys/sw/fbdev/fbdev_sw_winsys.h b/src/gallium/winsys/sw/fbdev/fbdev_sw_winsys.h
new file mode 100644
index 0000000..d958ab9
--- /dev/null
+++ b/src/gallium/winsys/sw/fbdev/fbdev_sw_winsys.h
@@ -0,0 +1,38 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef FBDEV_SW_WINSYS
+#define FBDEV_SW_WINSYS
+
+struct sw_winsys;
+enum pipe_format;
+
+struct sw_winsys *
+fbdev_create_sw_winsys(int fd, enum pipe_format format);
+
+#endif /* FBDEV_SW_WINSYS */
diff --git a/src/mapi/mapi/u_execmem.c b/src/mapi/mapi/u_execmem.c
index 00df830..e5072e0 100644
--- a/src/mapi/mapi/u_execmem.c
+++ b/src/mapi/mapi/u_execmem.c
@@ -105,6 +105,8 @@
 
 #else
 
+#include <stdlib.h>
+
 static int
 init_map(void)
 {
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 629ec0f..c548e10 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -2400,6 +2400,9 @@
          break;
       }
 
+      /* Set MaxLevel large enough to hold the new level when we allocate it */
+      _mesa_TexParameteri(target, GL_TEXTURE_MAX_LEVEL, dstLevel);
+
       /* Create empty dest image */
       if (target == GL_TEXTURE_1D) {
          _mesa_TexImage1D(target, dstLevel, srcImage->InternalFormat,
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index a0039e8..8319815 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -61,6 +61,7 @@
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
+	brw_state.c \
 	brw_state_batch.c \
 	brw_state_cache.c \
 	brw_state_dump.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index c9e42a1..cfce5d3 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -36,7 +36,8 @@
 #include "brw_util.h"
 #include "main/macros.h"
 
-static void prepare_cc_vp( struct brw_context *brw )
+void
+brw_update_cc_vp(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct brw_cc_viewport ccv;
@@ -54,40 +55,9 @@
    }
 
    drm_intel_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv),
-				  NULL, 0);
+   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv));
 }
 
-const struct brw_tracked_state brw_cc_vp = {
-   .dirty = {
-      .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = 0
-   },
-   .prepare = prepare_cc_vp
-};
-
-struct brw_cc_unit_key {
-   GLboolean stencil, stencil_two_side, color_blend, alpha_enabled;
-
-   GLenum stencil_func[2], stencil_fail_op[2];
-   GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
-   GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2];
-   GLenum logic_op;
-
-   GLenum blend_eq_rgb, blend_eq_a;
-   GLenum blend_src_rgb, blend_src_a;
-   GLenum blend_dst_rgb, blend_dst_a;
-
-   GLenum alpha_func;
-   GLclampf alpha_ref;
-
-   GLboolean dither;
-
-   GLboolean depth_test, depth_write;
-   GLenum depth_func;
-};
-
 /**
  * Modify blend function to force destination alpha to 1.0
  *
@@ -110,136 +80,83 @@
    return function;
 }
 
-static void
-cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
+static void prepare_cc_unit(struct brw_context *brw)
+{
+   brw_add_validated_bo(brw, brw->cc.vp_bo);
+}
+
+/**
+ * Builds the CC unit state and copies it into brw_state_batch() memory.
+ */
+static void upload_cc_unit(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
-   const unsigned back = ctx->Stencil._BackFace;
+   struct brw_cc_unit_state cc;
+   void *map;
 
-   memset(key, 0, sizeof(*key));
+   memset(&cc, 0, sizeof(cc));
 
-   key->stencil = ctx->Stencil._Enabled;
-   key->stencil_two_side = ctx->Stencil._TestTwoSide;
+   /* _NEW_STENCIL */
+   if (ctx->Stencil._Enabled) {
+      const unsigned back = ctx->Stencil._BackFace;
 
-   if (key->stencil) {
-      key->stencil_func[0] = ctx->Stencil.Function[0];
-      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
-      key->stencil_ref[0] = ctx->Stencil.Ref[0];
-      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
-      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
-   }
-   if (key->stencil_two_side) {
-      key->stencil_func[1] = ctx->Stencil.Function[back];
-      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
-      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
-      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
-      key->stencil_ref[1] = ctx->Stencil.Ref[back];
-      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
-      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
+      cc.cc0.stencil_enable = 1;
+      cc.cc0.stencil_func =
+	 intel_translate_compare_func(ctx->Stencil.Function[0]);
+      cc.cc0.stencil_fail_op =
+	 intel_translate_stencil_op(ctx->Stencil.FailFunc[0]);
+      cc.cc0.stencil_pass_depth_fail_op =
+	 intel_translate_stencil_op(ctx->Stencil.ZFailFunc[0]);
+      cc.cc0.stencil_pass_depth_pass_op =
+	 intel_translate_stencil_op(ctx->Stencil.ZPassFunc[0]);
+      cc.cc1.stencil_ref = ctx->Stencil.Ref[0];
+      cc.cc1.stencil_write_mask = ctx->Stencil.WriteMask[0];
+      cc.cc1.stencil_test_mask = ctx->Stencil.ValueMask[0];
+
+      if (ctx->Stencil._TestTwoSide) {
+	 cc.cc0.bf_stencil_enable = 1;
+	 cc.cc0.bf_stencil_func =
+	    intel_translate_compare_func(ctx->Stencil.Function[back]);
+	 cc.cc0.bf_stencil_fail_op =
+	    intel_translate_stencil_op(ctx->Stencil.FailFunc[back]);
+	 cc.cc0.bf_stencil_pass_depth_fail_op =
+	    intel_translate_stencil_op(ctx->Stencil.ZFailFunc[back]);
+	 cc.cc0.bf_stencil_pass_depth_pass_op =
+	    intel_translate_stencil_op(ctx->Stencil.ZPassFunc[back]);
+	 cc.cc1.bf_stencil_ref = ctx->Stencil.Ref[back];
+	 cc.cc2.bf_stencil_write_mask = ctx->Stencil.WriteMask[back];
+	 cc.cc2.bf_stencil_test_mask = ctx->Stencil.ValueMask[back];
+      }
+
+      /* Not really sure about this:
+       */
+      if (ctx->Stencil.WriteMask[0] ||
+	  (ctx->Stencil._TestTwoSide && ctx->Stencil.WriteMask[back]))
+	 cc.cc0.stencil_write_enable = 1;
    }
 
-   if (ctx->Color._LogicOpEnabled)
-      key->logic_op = ctx->Color.LogicOp;
-   else
-      key->logic_op = GL_COPY;
-
-   key->color_blend = ctx->Color.BlendEnabled;
-   if (key->color_blend) {
-      key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
-      key->blend_eq_a = ctx->Color.BlendEquationA;
-      key->blend_src_rgb = ctx->Color.BlendSrcRGB;
-      key->blend_dst_rgb = ctx->Color.BlendDstRGB;
-      key->blend_src_a = ctx->Color.BlendSrcA;
-      key->blend_dst_a = ctx->Color.BlendDstA;
+   /* _NEW_COLOR */
+   if (ctx->Color._LogicOpEnabled && ctx->Color.LogicOp != GL_COPY) {
+      cc.cc2.logicop_enable = 1;
+      cc.cc5.logicop_func = intel_translate_logic_op(ctx->Color.LogicOp);
+   } else if (ctx->Color.BlendEnabled) {
+      GLenum eqRGB = ctx->Color.BlendEquationRGB;
+      GLenum eqA = ctx->Color.BlendEquationA;
+      GLenum srcRGB = ctx->Color.BlendSrcRGB;
+      GLenum dstRGB = ctx->Color.BlendDstRGB;
+      GLenum srcA = ctx->Color.BlendSrcA;
+      GLenum dstA = ctx->Color.BlendDstA;
 
       /* If the renderbuffer is XRGB, we have to frob the blend function to
        * force the destination alpha to 1.0.  This means replacing GL_DST_ALPHA
        * with GL_ONE and GL_ONE_MINUS_DST_ALPHA with GL_ZERO.
        */
       if (ctx->DrawBuffer->Visual.alphaBits == 0) {
-	 key->blend_src_rgb = fix_xRGB_alpha(key->blend_src_rgb);
-	 key->blend_src_a   = fix_xRGB_alpha(key->blend_src_a);
-	 key->blend_dst_rgb = fix_xRGB_alpha(key->blend_dst_rgb);
-	 key->blend_dst_a   = fix_xRGB_alpha(key->blend_dst_a);
+	 srcRGB = fix_xRGB_alpha(srcRGB);
+	 srcA   = fix_xRGB_alpha(srcA);
+	 dstRGB = fix_xRGB_alpha(dstRGB);
+	 dstA   = fix_xRGB_alpha(dstA);
       }
-   }
-
-   key->alpha_enabled = ctx->Color.AlphaEnabled;
-   if (key->alpha_enabled) {
-      key->alpha_func = ctx->Color.AlphaFunc;
-      key->alpha_ref = ctx->Color.AlphaRef;
-   }
-
-   key->dither = ctx->Color.DitherFlag;
-
-   key->depth_test = ctx->Depth.Test;
-   if (key->depth_test) {
-      key->depth_func = ctx->Depth.Func;
-      key->depth_write = ctx->Depth.Mask;
-   }
-}
-
-/**
- * Creates the state cache entry for the given CC unit key.
- */
-static drm_intel_bo *
-cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
-{
-   struct brw_cc_unit_state cc;
-   drm_intel_bo *bo;
-
-   memset(&cc, 0, sizeof(cc));
-
-   /* _NEW_STENCIL */
-   if (key->stencil) {
-      cc.cc0.stencil_enable = 1;
-      cc.cc0.stencil_func =
-	 intel_translate_compare_func(key->stencil_func[0]);
-      cc.cc0.stencil_fail_op =
-	 intel_translate_stencil_op(key->stencil_fail_op[0]);
-      cc.cc0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
-      cc.cc0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
-      cc.cc1.stencil_ref = key->stencil_ref[0];
-      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
-      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
-
-      if (key->stencil_two_side) {
-	 cc.cc0.bf_stencil_enable = 1;
-	 cc.cc0.bf_stencil_func =
-	    intel_translate_compare_func(key->stencil_func[1]);
-	 cc.cc0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(key->stencil_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
-	 cc.cc0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
-	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
-	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
-	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
-      }
-
-      /* Not really sure about this:
-       */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
-	 cc.cc0.stencil_write_enable = 1;
-   }
-
-   /* _NEW_COLOR */
-   if (key->logic_op != GL_COPY) {
-      cc.cc2.logicop_enable = 1;
-      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
 
       if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
 	 srcRGB = dstRGB = GL_ONE;
@@ -263,25 +180,27 @@
 				eqA != eqRGB);
    }
 
-   if (key->alpha_enabled) {
+   if (ctx->Color.AlphaEnabled) {
       cc.cc3.alpha_test = 1;
-      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      cc.cc3.alpha_test_func =
+	 intel_translate_compare_func(ctx->Color.AlphaFunc);
       cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
 
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], ctx->Color.AlphaRef);
    }
 
-   if (key->dither) {
+   if (ctx->Color.DitherFlag) {
       cc.cc5.dither_enable = 1;
       cc.cc6.y_dither_offset = 0;
       cc.cc6.x_dither_offset = 0;
    }
 
    /* _NEW_DEPTH */
-   if (key->depth_test) {
+   if (ctx->Depth.Test) {
       cc.cc2.depth_test = 1;
-      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
-      cc.cc2.depth_write_enable = key->depth_write;
+      cc.cc2.depth_test_function =
+	 intel_translate_compare_func(ctx->Depth.Func);
+      cc.cc2.depth_write_enable = ctx->Depth.Mask;
    }
 
    /* CACHE_NEW_CC_VP */
@@ -290,43 +209,25 @@
    if (INTEL_DEBUG & DEBUG_STATS)
       cc.cc5.statistics_enable = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
-			 key, sizeof(*key),
-			 &brw->cc.vp_bo, 1,
-			 &cc, sizeof(cc));
+   map = brw_state_batch(brw, sizeof(cc), 64,
+			 &brw->cc.state_bo, &brw->cc.state_offset);
+   memcpy(map, &cc, sizeof(cc));
+   brw->state.dirty.cache |= CACHE_NEW_CC_UNIT;
 
    /* Emit CC viewport relocation */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_cc_unit_state, cc4),
+   drm_intel_bo_emit_reloc(brw->cc.state_bo, (brw->cc.state_offset +
+					      offsetof(struct brw_cc_unit_state,
+						       cc4)),
 			   brw->cc.vp_bo, 0,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
-   return bo;
-}
-
-static void prepare_cc_unit( struct brw_context *brw )
-{
-   struct brw_cc_unit_key key;
-
-   cc_unit_populate_key(brw, &key);
-
-   drm_intel_bo_unreference(brw->cc.state_bo);
-   brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_CC_UNIT,
-				       &key, sizeof(key),
-				       &brw->cc.vp_bo, 1,
-				       NULL);
-
-   if (brw->cc.state_bo == NULL)
-      brw->cc.state_bo = cc_unit_create_from_key(brw, &key);
 }
 
 const struct brw_tracked_state brw_cc_unit = {
    .dirty = {
       .mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH,
-      .brw = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = CACHE_NEW_CC_VP
    },
    .prepare = prepare_cc_unit,
+   .emit = upload_cc_unit,
 };
-
-
-
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index e688431..6d064b8 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -34,8 +34,6 @@
 #include "main/api_noop.h"
 #include "main/macros.h"
 #include "main/simple_list.h"
-#include "program/shader_api.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_draw.h"
@@ -54,6 +52,9 @@
 
    brwInitFragProgFuncs( functions );
    brw_init_queryobj_functions(functions);
+
+   functions->Enable = brw_enable;
+   functions->DepthRange = brw_depth_range;
 }
 
 GLboolean brwCreateContext( int api,
@@ -187,6 +188,11 @@
 
    brw_draw_init( brw );
 
+   /* Now that most driver functions are hooked up, initialize some of the
+    * immediate state.
+    */
+   brw_update_cc_vp(brw);
+
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index d97634c..cc4e663 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -131,6 +131,7 @@
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_WM_SURFACES		0x1000
+#define BRW_NEW_BINDING_TABLE		0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
 /**
@@ -143,6 +144,8 @@
 #define BRW_NEW_NR_WM_SURFACES		0x40000
 #define BRW_NEW_NR_VS_SURFACES		0x80000
 #define BRW_NEW_INDEX_BUFFER		0x100000
+#define BRW_NEW_VS_CONSTBUF		0x200000
+#define BRW_NEW_WM_CONSTBUF		0x400000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -160,7 +163,6 @@
 struct brw_vertex_program {
    struct gl_vertex_program program;
    GLuint id;
-   drm_intel_bo *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
 
@@ -172,7 +174,6 @@
    GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
    GLboolean use_const_buffer;
-   drm_intel_bo *const_buffer;    /** Program constant buffer/surface */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -301,8 +302,6 @@
    BRW_CLIP_VP,
    BRW_CLIP_UNIT,
    BRW_CLIP_PROG,
-   BRW_SS_SURFACE,
-   BRW_SS_SURF_BIND,
 
    BRW_MAX_CACHE
 };
@@ -376,8 +375,6 @@
 #define CACHE_NEW_CLIP_VP                (1<<BRW_CLIP_VP)
 #define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT)
 #define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG)
-#define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE)
-#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND)
 
 struct brw_cached_batch_item {
    struct header *header;
@@ -460,12 +457,11 @@
        * consisting of the vertex buffers, pipelined state pointers,
        * the CURBE, the depth buffer, and a query BO.
        */
-      drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + 16];
+      drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + BRW_WM_MAX_SURF + 16];
       int validated_bo_count;
    } state;
 
-   struct brw_cache cache;  /** non-surface items */
-   struct brw_cache surface_cache;  /* surface items */
+   struct brw_cache cache;
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
@@ -594,10 +590,13 @@
 
       drm_intel_bo *prog_bo;
       drm_intel_bo *state_bo;
+      drm_intel_bo *const_bo;
 
       /** Binding table of pointers to surf_bo entries */
       drm_intel_bo *bind_bo;
+      uint32_t bind_bo_offset;
       drm_intel_bo *surf_bo[BRW_VS_MAX_SURF];
+      uint32_t surf_offset[BRW_VS_MAX_SURF];
       GLuint nr_surfaces;      
    } vs;
 
@@ -649,10 +648,13 @@
 
       /** Binding table of pointers to surf_bo entries */
       drm_intel_bo *bind_bo;
+      uint32_t bind_bo_offset;
       drm_intel_bo *surf_bo[BRW_WM_MAX_SURF];
+      uint32_t surf_offset[BRW_WM_MAX_SURF];
 
       drm_intel_bo *prog_bo;
       drm_intel_bo *state_bo;
+      drm_intel_bo *const_bo;
    } wm;
 
 
@@ -667,6 +669,7 @@
       drm_intel_bo *color_calc_state_bo;
 
       drm_intel_bo *state_bo;
+      uint32_t state_offset;
    } cc;
 
    struct {
@@ -727,6 +730,9 @@
  */
 void brw_upload_urb_fence(struct brw_context *brw);
 
+/* brw_cc.c */
+void brw_update_cc_vp(struct brw_context *brw);
+
 /* brw_curbe.c
  */
 void brw_upload_cs_urb_state(struct brw_context *brw);
@@ -734,6 +740,10 @@
 /* brw_disasm.c */
 int brw_disasm (FILE *file, struct brw_instruction *inst, int gen);
 
+/* brw_state.c */
+void brw_enable(GLcontext * ctx, GLenum cap, GLboolean state);
+void brw_depth_range(GLcontext *ctx, GLclampd nearval, GLclampd farval);
+
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 3d52f6f..8196d8c 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -182,8 +182,6 @@
    GLcontext *ctx = &brw->intel.ctx;
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
-   const struct brw_fragment_program *fp =
-      brw_fragment_program_const(brw->fragment_program);
    const GLuint sz = brw->curbe.total_size;
    const GLuint bufsz = sz * 16 * sizeof(GLfloat);
    GLfloat *buf;
@@ -200,8 +198,6 @@
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
-      _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
-
       /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
 	 buf[offset + i] = *brw->wm.prog_data->param[i];
@@ -244,14 +240,6 @@
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (brw->vertex_program->IsNVProgram)
-	 _mesa_load_tracked_matrices(ctx);
-
-      /* Updates the ParamaterValues[i] pointers for all parameters of the
-       * basic type of PROGRAM_STATE_VAR.
-       */
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
-
       if (vp->use_const_buffer) {
 	 /* Load the subset of push constants that will get used when
 	  * we also have a pull constant buffer.
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 2d3556b..39bf5b6 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -998,7 +998,7 @@
 # define GEN6_WM_LINE_AA_WIDTH_2_0			(2 << 14)
 # define GEN6_WM_LINE_AA_WIDTH_4_0			(3 << 14)
 # define GEN6_WM_POLYGON_STIPPLE_ENABLE			(1 << 13)
-# define GEN6_WM_LINE_STIPPLE_ENABLE			(1 << 12)
+# define GEN6_WM_LINE_STIPPLE_ENABLE			(1 << 11)
 # define GEN6_WM_OMASK_TO_RENDER_TARGET			(1 << 9)
 # define GEN6_WM_USES_SOURCE_W				(1 << 8)
 # define GEN6_WM_DUAL_SOURCE_BLEND_ENABLE		(1 << 7)
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 3e305c8..16331cc 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -151,9 +151,6 @@
    prim_packet.start_instance_location = 0;
    prim_packet.base_vert_location = prim->basevertex;
 
-   /* Can't wrap here, since we rely on the validated state. */
-   intel->no_batch_wrap = GL_TRUE;
-
    /* If we're set to always flush, do it before and after the primitive emit.
     * We want to catch both missed flushes that hurt instruction/state cache
     * and missed flushes of the render cache as it heads to other parts of
@@ -169,8 +166,6 @@
    if (intel->always_flush_cache) {
       intel_batchbuffer_emit_mi_flush(intel->batch);
    }
-
-   intel->no_batch_wrap = GL_FALSE;
 }
 
 static void brw_merge_inputs( struct brw_context *brw,
@@ -394,11 +389,14 @@
 	    }
 	 }
 
+	 intel->no_batch_wrap = GL_TRUE;
 	 brw_upload_state(brw);
       }
 
       brw_emit_prim(brw, &prim[i], hw_prim);
 
+      intel->no_batch_wrap = GL_FALSE;
+
       retval = GL_TRUE;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 175899b..34dfe10 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -286,6 +286,7 @@
 				    GLuint response_length,
 				    GLboolean end_of_thread)
 {
+	struct intel_context *intel = &brw->intel;
 	brw_set_src1(insn, brw_imm_d(0));
 
 	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
@@ -298,8 +299,12 @@
 	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
 	insn->bits3.urb_gen5.msg_length = 1;
 	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
-	insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
-	insn->bits2.send_gen5.end_of_thread = end_of_thread;
+	if (intel->gen >= 6) {
+	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
+	} else {
+	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
+	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
+	}
 }
 
 static void brw_set_urb_message( struct brw_context *brw,
@@ -966,10 +971,25 @@
 		  struct brw_reg src,
 		  GLuint precision )
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; 
    GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; 
 
+   if (intel->gen >= 6) {
+      insn = next_insn(p, BRW_OPCODE_MATH);
+
+      /* Math is the same ISA format as other opcodes, except that CondModifier
+       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
+       */
+      insn->header.destreg__conditionalmod = function;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_src1(insn, brw_null_reg());
+      return;
+   }
+
    /* First instruction:
     */
    brw_push_insn_state(p);
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 35908ee..572175f 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -96,18 +96,12 @@
    .emit = upload_drawing_rect
 };
 
-static void prepare_binding_table_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->vs.bind_bo);
-   brw_add_validated_bo(brw, brw->wm.bind_bo);
-}
-
 /**
  * Upload the binding table pointers, which point each stage's array of surface
  * state pointers.
  *
  * The binding table pointers are relative to the surface state base address,
- * which is 0.
+ * which points at the batchbuffer containing the streamed batch state.
  */
 static void upload_binding_table_pointers(struct brw_context *brw)
 {
@@ -115,24 +109,20 @@
 
    BEGIN_BATCH(6);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   if (brw->vs.bind_bo != NULL)
-      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
-   else
-      OUT_BATCH(0);
+   OUT_BATCH(brw->vs.bind_bo_offset);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
-   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
+   OUT_BATCH(brw->wm.bind_bo_offset);
    ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state brw_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SURF_BIND,
+      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .cache = 0,
    },
-   .prepare = prepare_binding_table_pointers,
    .emit = upload_binding_table_pointers,
 };
 
@@ -141,7 +131,7 @@
  * state pointers.
  *
  * The binding table pointers are relative to the surface state base address,
- * which is 0.
+ * which points at the batchbuffer containing the streamed batch state.
  */
 static void upload_gen6_binding_table_pointers(struct brw_context *brw)
 {
@@ -153,22 +143,18 @@
 	     GEN6_BINDING_TABLE_MODIFY_GS |
 	     GEN6_BINDING_TABLE_MODIFY_PS |
 	     (4 - 2));
-   if (brw->vs.bind_bo != NULL)
-      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
-   else
-      OUT_BATCH(0);
+   OUT_BATCH(brw->vs.bind_bo_offset); /* vs */
    OUT_BATCH(0); /* gs */
-   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
+   OUT_BATCH(brw->wm.bind_bo_offset); /* wm/ps */
    ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state gen6_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SURF_BIND,
+      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .cache = 0,
    },
-   .prepare = prepare_binding_table_pointers,
    .emit = upload_gen6_binding_table_pointers,
 };
 
@@ -199,7 +185,8 @@
    OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
    OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+	     brw->cc.state_offset);
    ADVANCE_BATCH();
 
    brw->state.dirty.brw |= BRW_NEW_PSP;
@@ -213,7 +200,6 @@
    brw_add_validated_bo(brw, brw->clip.state_bo);
    brw_add_validated_bo(brw, brw->sf.state_bo);
    brw_add_validated_bo(brw, brw->wm.state_bo);
-   brw_add_validated_bo(brw, brw->cc.state_bo);
 }
 
 static void upload_psp_urb_cbs(struct brw_context *brw )
@@ -590,23 +576,23 @@
 /**
  * Define the base addresses which some state is referenced from.
  *
- * This allows us to avoid having to emit relocations in many places for
- * cached state, and instead emit pointers inside of large, mostly-static
- * state pools.  This comes at the expense of memory, and more expensive cache
- * misses.
+ * This allows us to avoid having to emit relocations for the objects,
+ * and is actually required for binding table pointers on gen6.
+ *
+ * Surface state base address covers binding table pointers and
+ * surface state objects, but not the surfaces that the surface state
+ * objects point to.
  */
 static void upload_state_base_address( struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
 
-   /* Output the structure (brw_state_base_address) directly to the
-    * batchbuffer, so we can emit relocations inline.
-    */
    if (intel->gen >= 6) {
        BEGIN_BATCH(10);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
        OUT_BATCH(1); /* General state base address */
-       OUT_BATCH(1); /* Surface state base address */
+       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+		 1); /* Surface state base address */
        OUT_BATCH(1); /* Dynamic state base address */
        OUT_BATCH(1); /* Indirect object base address */
        OUT_BATCH(1); /* Instruction base address */
@@ -619,7 +605,8 @@
        BEGIN_BATCH(8);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
        OUT_BATCH(1); /* General state base address */
-       OUT_BATCH(1); /* Surface state base address */
+       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+		 1); /* Surface state base address */
        OUT_BATCH(1); /* Indirect object base address */
        OUT_BATCH(1); /* Instruction base address */
        OUT_BATCH(1); /* General state upper bound */
@@ -630,7 +617,8 @@
        BEGIN_BATCH(6);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
        OUT_BATCH(1); /* General state base address */
-       OUT_BATCH(1); /* Surface state base address */
+       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+		 1); /* Surface state base address */
        OUT_BATCH(1); /* Indirect object base address */
        OUT_BATCH(1); /* General state upper bound */
        OUT_BATCH(1); /* Indirect object upper bound */
@@ -641,7 +629,7 @@
 const struct brw_tracked_state brw_state_base_address = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT,
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
    .emit = upload_state_base_address
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index cc9ac6d..aeed24d 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -31,10 +31,10 @@
   
 #include "main/imports.h"
 #include "main/enums.h"
+#include "main/shaderobj.h"
 #include "program/prog_parameter.h"
 #include "program/program.h"
 #include "program/programopt.h"
-#include "program/shader_api.h"
 #include "tnl/tnl.h"
 
 #include "brw_context.h"
@@ -95,20 +95,6 @@
 static void brwDeleteProgram( GLcontext *ctx,
 			      struct gl_program *prog )
 {
-   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
-      struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
-      struct brw_fragment_program *brw_fp = brw_fragment_program(fp);
-
-      drm_intel_bo_unreference(brw_fp->const_buffer);
-   }
-
-   if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
-      struct gl_vertex_program *vp = (struct gl_vertex_program *) prog;
-      struct brw_vertex_program *brw_vp = brw_vertex_program(vp);
-
-      drm_intel_bo_unreference(brw_vp->const_buffer);
-   }
-
    _mesa_delete_program( ctx, prog );
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index a95acb4..e290ca9 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -105,8 +105,7 @@
    }
 
    drm_intel_bo_unreference(brw->sf.vp_bo);
-   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, &sfv, sizeof(sfv),
-				  NULL, 0);
+   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, &sfv, sizeof(sfv));
 }
 
 const struct brw_tracked_state brw_sf_vp = {
diff --git a/src/mesa/drivers/dri/i965/brw_state.c b/src/mesa/drivers/dri/i965/brw_state.c
new file mode 100644
index 0000000..1e77e42
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_state.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_context.h"
+
+void
+brw_enable(GLcontext *ctx, GLenum cap, GLboolean state)
+{
+   /* Driver Enable hook; 'state' is unused because the helper below is
+    * called regardless of whether the capability was turned on or off.
+    */
+   if (cap != GL_DEPTH_CLAMP)
+      return;
+
+   brw_update_cc_vp(brw_context(ctx));
+}
+
+void
+brw_depth_range(GLcontext *ctx, GLclampd nearval, GLclampd farval)
+{
+   /* DepthRange hook: nearval/farval are not read here. */
+   if (!ctx->Transform.DepthClamp)
+      return;
+   brw_update_cc_vp(brw_context(ctx));
+}
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 8594921..40eece2 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -48,10 +48,11 @@
 
 const struct brw_tracked_state brw_blend_constant_color;
 const struct brw_tracked_state brw_cc_unit;
-const struct brw_tracked_state brw_cc_vp;
 const struct brw_tracked_state brw_check_fallback;
 const struct brw_tracked_state brw_clip_prog;
 const struct brw_tracked_state brw_clip_unit;
+const struct brw_tracked_state brw_vs_constants;
+const struct brw_tracked_state brw_wm_constants;
 const struct brw_tracked_state brw_constant_buffer;
 const struct brw_tracked_state brw_curbe_offsets;
 const struct brw_tracked_state brw_invarient_state;
@@ -80,6 +81,7 @@
 const struct brw_tracked_state brw_wm_samplers;
 const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
+const struct brw_tracked_state brw_wm_binding_table;
 const struct brw_tracked_state brw_wm_unit;
 
 const struct brw_tracked_state brw_psp_urb_cbs;
@@ -93,7 +95,6 @@
 const struct brw_tracked_state gen6_binding_table_pointers;
 const struct brw_tracked_state gen6_blend_state;
 const struct brw_tracked_state gen6_cc_state_pointers;
-const struct brw_tracked_state gen6_cc_vp;
 const struct brw_tracked_state gen6_clip_state;
 const struct brw_tracked_state gen6_clip_vp;
 const struct brw_tracked_state gen6_color_calc_state;
@@ -108,20 +109,6 @@
 const struct brw_tracked_state gen6_vs_state;
 const struct brw_tracked_state gen6_wm_state;
 
-/**
- * Use same key for WM and VS surfaces.
- */
-struct brw_surface_key {
-   GLenum target, depthmode;
-   drm_intel_bo *bo;
-   GLint format, internal_format;
-   GLint first_level, last_level;
-   GLint width, height, depth;
-   GLint pitch, cpp;
-   uint32_t tiling;
-   GLuint offset;
-};
-
 /***********************************************************************
  * brw_state.c
  */
@@ -137,9 +124,7 @@
 drm_intel_bo *brw_cache_data(struct brw_cache *cache,
 		       enum brw_cache_id cache_id,
 		       const void *data,
-		       GLuint size,
-		       drm_intel_bo **reloc_bufs,
-		       GLuint nr_reloc_bufs);
+		       GLuint size);
 
 drm_intel_bo *brw_upload_cache(struct brw_cache *cache,
 			       enum brw_cache_id cache_id,
@@ -173,7 +158,6 @@
 
 void brw_init_caches( struct brw_context *brw );
 void brw_destroy_caches( struct brw_context *brw );
-void brw_state_cache_bo_delete(struct brw_cache *cache, drm_intel_bo *bo);
 
 /***********************************************************************
  * brw_state_batch.c
@@ -186,10 +170,17 @@
 				   GLuint sz );
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache( struct brw_context *brw );
+void *brw_state_batch(struct brw_context *brw,
+		      int size,
+		      int alignment,
+		      drm_intel_bo **out_bo,
+		      uint32_t *out_offset);
 
 /* brw_wm_surface_state.c */
-drm_intel_bo *
-brw_create_constant_surface( struct brw_context *brw,
-                             struct brw_surface_key *key );
+void brw_create_constant_surface(struct brw_context *brw,
+				 drm_intel_bo *bo,
+				 int width,
+				 drm_intel_bo **out_bo,
+				 uint32_t *out_offset);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 3901941..be3989e 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -97,3 +97,52 @@
 {
    brw_clear_batch_cache(brw);
 }
+
+/**
+ * Allocates a block of space in the batchbuffer for indirect state.
+ *
+ * Separate BOs per piece of indirect state would overallocate (4096
+ * bytes even for a 20-byte surface state) and add buffers to walk for
+ * aperture size checking.  The aperture-checking performance hacks also
+ * forbid adding relocations to a separate state buffer once the batch
+ * points at it, so these blocks are taken from the top of the batch.
+ *
+ * \param size, alignment  space to reserve; offset is rounded down to this
+ * \param out_bo      re-pointed (taking a reference) at the batch's buffer
+ * \param out_offset  receives the block's offset within that buffer
+ * \return batch->map + offset, i.e. where the caller writes the state
+ */
+void *
+brw_state_batch(struct brw_context *brw,
+		int size,
+		int alignment,
+		drm_intel_bo **out_bo,
+		uint32_t *out_offset)
+{
+   struct intel_batchbuffer *batch = brw->intel.batch;
+   uint32_t offset;
+
+   assert(size < batch->buf->size);
+   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
+
+   /* If allocating from the top would wrap below the batchbuffer, or
+    * if the batch's used space (plus the reserved pad) collides with our
+    * space, then flush and try again.
+    */
+   if (batch->state_batch_offset < size ||
+       offset < batch->ptr - batch->map + batch->reserved_space) {
+      intel_batchbuffer_flush(batch);
+      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
+   }
+
+   batch->state_batch_offset = offset;
+
+   if (*out_bo != batch->buf) {
+      drm_intel_bo_unreference(*out_bo);
+      drm_intel_bo_reference(batch->buf);
+      *out_bo = batch->buf;
+   }
+
+   *out_offset = offset;
+   return batch->map + offset;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index ea81ad1..b31d849 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -310,9 +310,7 @@
 brw_cache_data(struct brw_cache *cache,
 	       enum brw_cache_id cache_id,
 	       const void *data,
-	       GLuint data_size,
-	       drm_intel_bo **reloc_bufs,
-	       GLuint nr_reloc_bufs)
+	       GLuint data_size)
 {
    drm_intel_bo *bo;
    struct brw_cache_item *item, lookup;
@@ -321,8 +319,8 @@
    lookup.cache_id = cache_id;
    lookup.key = data;
    lookup.key_size = data_size;
-   lookup.reloc_bufs = reloc_bufs;
-   lookup.nr_reloc_bufs = nr_reloc_bufs;
+   lookup.reloc_bufs = NULL;
+   lookup.nr_reloc_bufs = 0;
    hash = hash_key(&lookup);
    lookup.hash = hash;
 
@@ -335,7 +333,7 @@
 
    bo = brw_upload_cache(cache, cache_id,
 			 data, data_size,
-			 reloc_bufs, nr_reloc_bufs,
+			 NULL, 0,
 			 data, data_size);
 
    return bo;
@@ -396,29 +394,10 @@
    brw_init_cache_id(cache, "DEPTH_STENCIL_STATE", BRW_DEPTH_STENCIL_STATE);
 }
 
-
-static void
-brw_init_surface_cache(struct brw_context *brw)
-{
-   struct brw_cache *cache = &brw->surface_cache;
-
-   cache->brw = brw;
-
-   cache->size = 7;
-   cache->n_items = 0;
-   cache->items = (struct brw_cache_item **)
-      calloc(1, cache->size * sizeof(struct brw_cache_item));
-
-   brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE);
-   brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND);
-}
-
-
 void
 brw_init_caches(struct brw_context *brw)
 {
    brw_init_non_surface_cache(brw);
-   brw_init_surface_cache(brw);
 }
 
 
@@ -452,56 +431,17 @@
    brw->state.dirty.cache |= ~0;
 }
 
-/* Clear all entries from the cache that point to the given bo.
- *
- * This lets us release memory for reuse earlier for known-dead buffers,
- * at the cost of walking the entire hash table.
- */
-void
-brw_state_cache_bo_delete(struct brw_cache *cache, drm_intel_bo *bo)
-{
-   struct brw_cache_item **prev;
-   GLuint i;
-
-   if (INTEL_DEBUG & DEBUG_STATE)
-      printf("%s\n", __FUNCTION__);
-
-   for (i = 0; i < cache->size; i++) {
-      for (prev = &cache->items[i]; *prev;) {
-	 struct brw_cache_item *c = *prev;
-
-	 if (drm_intel_bo_references(c->bo, bo)) {
-	    int j;
-
-	    *prev = c->next;
-
-	    for (j = 0; j < c->nr_reloc_bufs; j++)
-	       drm_intel_bo_unreference(c->reloc_bufs[j]);
-	    drm_intel_bo_unreference(c->bo);
-	    free((void *)c->key);
-	    free(c);
-	    cache->n_items--;
-	 } else {
-	    prev = &c->next;
-	 }
-      }
-   }
-}
-
 void
 brw_state_cache_check_size(struct brw_context *brw)
 {
    if (INTEL_DEBUG & DEBUG_STATE)
       printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
 
-   /* un-tuned guess.  We've got around 20 state objects for a total of around
-    * 32k, so 1000 of them is around 1.5MB.
+   /* un-tuned guess.  Each object is generally a page, so 1000 of them is 4 MB of
+    * state cache.
     */
    if (brw->cache.n_items > 1000)
       brw_clear_cache(brw, &brw->cache);
-
-   if (brw->surface_cache.n_items > 1000)
-      brw_clear_cache(brw, &brw->surface_cache);
 }
 
 
@@ -528,5 +468,4 @@
 brw_destroy_caches(struct brw_context *brw)
 {
    brw_destroy_cache(brw, &brw->cache);
-   brw_destroy_cache(brw, &brw->surface_cache);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index cb66806..d410861 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -111,8 +111,8 @@
 	 continue;
       }
       drm_intel_bo_map(surf_bo, GL_FALSE);
-      surfoff = surf_bo->offset;
-      surf = (struct brw_surface_state *)(surf_bo->virtual);
+      surfoff = surf_bo->offset + brw->wm.surf_offset[i];
+      surf = (struct brw_surface_state *)(surf_bo->virtual + brw->wm.surf_offset[i]);
 
       sprintf(name, "WM SS%d", i);
       state_out(name, surf, surfoff, 0, "%s %s\n",
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 49629ba..f92a19c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -61,12 +61,15 @@
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
-   &brw_cc_vp,
    &brw_cc_unit,
 
+   &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
+   &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
+
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
    &brw_wm_surfaces,		/* must do before samplers and unit */
+   &brw_wm_binding_table,
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -113,7 +116,6 @@
 
    &gen6_clip_vp,
    &gen6_sf_vp,
-   &gen6_cc_vp,
 
    /* Command packets: */
    &brw_invarient_state,
@@ -126,9 +128,13 @@
    &gen6_depth_stencil_state,	/* must do before cc unit */
    &gen6_cc_state_pointers,
 
+   &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
+   &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
+
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
    &brw_wm_surfaces,		/* must do before samplers and unit */
+   &brw_wm_binding_table,
 
    &brw_wm_samplers,
    &gen6_sampler_state,
@@ -266,6 +272,8 @@
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
    DEFINE_BIT(BRW_NEW_PSP),
+   DEFINE_BIT(BRW_NEW_WM_SURFACES),
+   DEFINE_BIT(BRW_NEW_BINDING_TABLE),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
@@ -292,8 +300,6 @@
    DEFINE_BIT(CACHE_NEW_CLIP_VP),
    DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
    DEFINE_BIT(CACHE_NEW_CLIP_PROG),
-   DEFINE_BIT(CACHE_NEW_SURFACE),
-   DEFINE_BIT(CACHE_NEW_SURF_BIND),
    {0, 0, 0}
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index f17fe48..2a7fa5b 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -278,7 +278,7 @@
    struct header header;
 
    struct {
-      GLuint aa_coverage_scope:8;
+      GLuint aa_coverage_slope:8;
       GLuint pad0:8;
       GLuint aa_coverage_bias:8;
       GLuint pad1:8;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 568c2e3..0250a68 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -42,42 +42,59 @@
  * Otherwise, constants go through the CURBEs using the brw_constant_buffer
  * state atom.
  */
-static drm_intel_bo *
-brw_vs_update_constant_buffer(struct brw_context *brw)
+static void
+prepare_vs_constants(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
    const int size = params->NumParameters * 4 * sizeof(GLfloat);
-   drm_intel_bo *const_buffer;
    int i;
 
-   /* BRW_NEW_VERTEX_PROGRAM */
-   if (!vp->use_const_buffer)
-      return NULL;
-
-   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
-				     size, 64);
-
-   /* _NEW_PROGRAM_CONSTANTS */
+   if (vp->program.IsNVProgram)
+      _mesa_load_tracked_matrices(ctx);
 
    /* Updates the ParamaterValues[i] pointers for all parameters of the
     * basic type of PROGRAM_STATE_VAR.
     */
    _mesa_load_state_parameters(&brw->intel.ctx, vp->program.Base.Parameters);
 
-   drm_intel_gem_bo_map_gtt(const_buffer);
+   /* BRW_NEW_VERTEX_PROGRAM */
+   if (!vp->use_const_buffer) {
+      if (brw->vs.const_bo) {
+	 drm_intel_bo_unreference(brw->vs.const_bo);
+	 brw->vs.const_bo = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
+      }
+      return;
+   }
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   drm_intel_bo_unreference(brw->vs.const_bo);
+   brw->vs.const_bo = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+					 size, 64);
+
+   drm_intel_gem_bo_map_gtt(brw->vs.const_bo);
    for (i = 0; i < params->NumParameters; i++) {
-      memcpy(const_buffer->virtual + i * 4 * sizeof(float),
+      memcpy(brw->vs.const_bo->virtual + i * 4 * sizeof(float),
 	     params->ParameterValues[i],
 	     4 * sizeof(float));
    }
-   drm_intel_gem_bo_unmap_gtt(const_buffer);
-
-   return const_buffer;
+   drm_intel_gem_bo_unmap_gtt(brw->vs.const_bo);
+   brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
 }
 
+const struct brw_tracked_state brw_vs_constants = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_vs_constants,
+};
+
 /**
  * Update the surface state for a VS constant buffer.
  *
@@ -88,101 +105,41 @@
                                 GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
-   struct brw_surface_key key;
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
 
    assert(surf == 0);
 
-   /* If we're in this state update atom, we need to update VS constants, so
-    * free the old buffer and create a new one for the new contents.
-    */
-   drm_intel_bo_unreference(vp->const_buffer);
-   vp->const_buffer = brw_vs_update_constant_buffer(brw);
-
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
-   if (vp->const_buffer == NULL) {
+   if (brw->vs.const_bo == NULL) {
       drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
       brw->vs.surf_bo[surf] = NULL;
       return;
    }
 
-   memset(&key, 0, sizeof(key));
-
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
-   key.bo = vp->const_buffer;
-   key.depthmode = GL_NONE;
-   key.pitch = params->NumParameters;
-   key.width = params->NumParameters;
-   key.height = 1;
-   key.depth = 1;
-   key.cpp = 16;
-
-   /*
-   printf("%s:\n", __FUNCTION__);
-   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
-          key.width, key.height, key.depth, key.cpp, key.pitch);
-   */
-
-   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
-   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->vs.surf_bo[surf] == NULL) {
-      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
-   }
+   brw_create_constant_surface(brw, brw->vs.const_bo, params->NumParameters,
+			       &brw->vs.surf_bo[surf],
+			       &brw->vs.surf_offset[surf]);
 }
 
 
-/**
- * Constructs the binding table for the VS surface state.
- */
-static drm_intel_bo *
-brw_vs_get_binding_table(struct brw_context *brw)
+static void
+prepare_vs_surfaces(struct brw_context *brw)
 {
-   drm_intel_bo *bind_bo;
+   int nr_surfaces = 0;
 
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->vs.surf_bo, BRW_VS_MAX_SURF,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
-      uint32_t data[BRW_VS_MAX_SURF];
-      int i;
-
-      for (i = 0; i < BRW_VS_MAX_SURF; i++)
-         if (brw->vs.surf_bo[i])
-            data[i] = brw->vs.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
-				  data, data_size);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-	 if (brw->vs.surf_bo[i] != NULL) {
-	    /* The presumed offsets were set in the data values for
-	     * brw_upload_cache.
-	     */
-	    drm_intel_bo_emit_reloc(bind_bo, i * 4,
-				    brw->vs.surf_bo[i], 0,
-				    I915_GEM_DOMAIN_INSTRUCTION, 0);
-	 }
-      }
+   if (brw->vs.const_bo) {
+      brw_add_validated_bo(brw, brw->vs.const_bo);
+      nr_surfaces = 1;
    }
 
-   return bind_bo;
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
 }
 
 /**
@@ -192,43 +149,50 @@
  * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
  * CACHE_NEW_SURF_BIND for the binding table upload.
  */
-static void prepare_vs_surfaces(struct brw_context *brw )
+static void upload_vs_surfaces(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
+   uint32_t *bind;
    int i;
-   int nr_surfaces = 0;
+
+   /* BRW_NEW_NR_VS_SURFACES */
+   if (brw->vs.nr_surfaces == 0) {
+      if (brw->vs.bind_bo) {
+	 drm_intel_bo_unreference(brw->vs.bind_bo);
+	 brw->vs.bind_bo = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
+      }
+      return;
+   }
 
    brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
 
+   /* Might want to calculate nr_surfaces first, to avoid taking up so much
+    * space for the binding table. (once we have vs samplers)
+    */
+   bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_VS_MAX_SURF,
+			  32, &brw->vs.bind_bo, &brw->vs.bind_bo_offset);
+
    for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-      if (brw->vs.surf_bo[i] != NULL) {
-	 nr_surfaces = i + 1;
+      /* BRW_NEW_VS_CONSTBUF */
+      if (brw->vs.surf_bo[i]) {
+	 bind[i] = brw->vs.surf_offset[i];
+      } else {
+	 bind[i] = 0;
       }
    }
 
-   if (brw->vs.nr_surfaces != nr_surfaces) {
-      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
-      brw->vs.nr_surfaces = nr_surfaces;
-   }
-
-   /* Note that we don't end up updating the bind_bo if we don't have a
-    * surface to be pointing at.  This should be relatively harmless, as it
-    * just slightly increases our working set size.
-    */
-   if (brw->vs.nr_surfaces != 0) {
-      drm_intel_bo_unreference(brw->vs.bind_bo);
-      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
-   }
+   brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
 }
 
 const struct brw_tracked_state brw_vs_surfaces = {
    .dirty = {
-      .mesa = (_NEW_PROGRAM_CONSTANTS),
-      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .mesa = 0,
+      .brw = (BRW_NEW_VS_CONSTBUF |
+	      BRW_NEW_NR_VS_SURFACES |
+	      BRW_NEW_BATCH),
       .cache = 0
    },
    .prepare = prepare_vs_surfaces,
+   .emit = upload_vs_surfaces,
 };
-
-
-
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index a02e958..14227a5 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -83,6 +83,7 @@
    dri_bo_release(&brw->vs.prog_bo);
    dri_bo_release(&brw->vs.state_bo);
    dri_bo_release(&brw->vs.bind_bo);
+   dri_bo_release(&brw->vs.const_bo);
    dri_bo_release(&brw->gs.prog_bo);
    dri_bo_release(&brw->gs.state_bo);
    dri_bo_release(&brw->clip.prog_bo);
@@ -99,6 +100,7 @@
    dri_bo_release(&brw->wm.sampler_bo);
    dri_bo_release(&brw->wm.prog_bo);
    dri_bo_release(&brw->wm.state_bo);
+   dri_bo_release(&brw->wm.const_bo);
    dri_bo_release(&brw->cc.prog_bo);
    dri_bo_release(&brw->cc.state_bo);
    dri_bo_release(&brw->cc.vp_bo);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 9fbabdc..1fc802c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -74,7 +74,7 @@
    COPY_4V(sdc.color, color); 
    
    return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			 &sdc, sizeof(sdc), NULL, 0);
+			 &sdc, sizeof(sdc));
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index c7b6124..17b016b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -196,36 +196,40 @@
    }
 }
 
-static drm_intel_bo *
-brw_create_texture_surface( struct brw_context *brw,
-			    struct brw_surface_key *key )
+static void
+brw_update_texture_surface( GLcontext *ctx, GLuint unit )
 {
+   struct brw_context *brw = brw_context(ctx);
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+   struct intel_texture_object *intelObj = intel_texture_object(tObj);
+   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
+   const GLuint surf_index = SURF_INDEX_TEXTURE(unit);
    struct brw_surface_state surf;
-   drm_intel_bo *bo;
+   void *map;
 
    memset(&surf, 0, sizeof(surf));
 
    surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
-   surf.ss0.surface_type = translate_tex_target(key->target);
-   surf.ss0.surface_format = translate_tex_format(key->format,
-						  key->internal_format,
-						  key->depthmode);
+   surf.ss0.surface_type = translate_tex_target(tObj->Target);
+   surf.ss0.surface_format = translate_tex_format(firstImage->TexFormat,
+						  firstImage->InternalFormat,
+						  tObj->DepthMode);
 
    /* This is ok for all textures with channel width 8bit or less:
     */
 /*    surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
-   surf.ss1.base_addr = key->bo->offset; /* reloc */
+   surf.ss1.base_addr = intelObj->mt->region->buffer->offset; /* reloc */
 
-   surf.ss2.mip_count = key->last_level - key->first_level;
-   surf.ss2.width = key->width - 1;
-   surf.ss2.height = key->height - 1;
-   brw_set_surface_tiling(&surf, key->tiling);
-   surf.ss3.pitch = (key->pitch * key->cpp) - 1;
-   surf.ss3.depth = key->depth - 1;
+   surf.ss2.mip_count = intelObj->lastLevel - intelObj->firstLevel;
+   surf.ss2.width = firstImage->Width - 1;
+   surf.ss2.height = firstImage->Height - 1;
+   brw_set_surface_tiling(&surf, intelObj->mt->region->tiling);
+   surf.ss3.pitch = (intelObj->mt->region->pitch * intelObj->mt->cpp) - 1;
+   surf.ss3.depth = firstImage->Depth - 1;
 
    surf.ss4.min_lod = 0;
  
-   if (key->target == GL_TEXTURE_CUBE_MAP) {
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
       surf.ss0.cube_pos_x = 1;
       surf.ss0.cube_pos_y = 1;
       surf.ss0.cube_pos_z = 1;
@@ -234,71 +238,33 @@
       surf.ss0.cube_neg_z = 1;
    }
 
-   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
-			 key, sizeof(*key),
-			 &key->bo, 1,
-			 &surf, sizeof(surf));
+   map = brw_state_batch(brw, sizeof(surf), 32,
+			 &brw->wm.surf_bo[surf_index],
+			 &brw->wm.surf_offset[surf_index]);
+   memcpy(map, &surf, sizeof(surf));
 
    /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_surface_state, ss1),
-			   key->bo, 0,
+   drm_intel_bo_emit_reloc(brw->wm.surf_bo[surf_index],
+			   brw->wm.surf_offset[surf_index] +
+			   offsetof(struct brw_surface_state, ss1),
+			   intelObj->mt->region->buffer, 0,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
-
-   return bo;
 }
 
-static void
-brw_update_texture_surface( GLcontext *ctx, GLuint unit )
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
-   struct intel_texture_object *intelObj = intel_texture_object(tObj);
-   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
-   struct brw_surface_key key;
-   const GLuint surf = SURF_INDEX_TEXTURE(unit);
-
-   memset(&key, 0, sizeof(key));
-
-   key.format = firstImage->TexFormat;
-   key.internal_format = firstImage->InternalFormat;
-   key.pitch = intelObj->mt->region->pitch;
-   key.depth = firstImage->Depth;
-   key.bo = intelObj->mt->region->buffer;
-   key.offset = 0;
-
-   key.target = tObj->Target;
-   key.depthmode = tObj->DepthMode;
-   key.first_level = intelObj->firstLevel;
-   key.last_level = intelObj->lastLevel;
-   key.width = firstImage->Width;
-   key.height = firstImage->Height;
-   key.cpp = intelObj->mt->cpp;
-   key.tiling = intelObj->mt->region->tiling;
-
-   drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->wm.surf_bo[surf] == NULL) {
-      brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
-   }
-}
-
-
-
 /**
  * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-drm_intel_bo *
-brw_create_constant_surface( struct brw_context *brw,
-                             struct brw_surface_key *key )
+void
+brw_create_constant_surface(struct brw_context *brw,
+			    drm_intel_bo *bo,
+			    int width,
+			    drm_intel_bo **out_bo,
+			    uint32_t *out_offset)
 {
-   const GLint w = key->width - 1;
+   const GLint w = width - 1;
    struct brw_surface_state surf;
-   drm_intel_bo *bo;
+   void *map;
 
    memset(&surf, 0, sizeof(surf));
 
@@ -306,29 +272,26 @@
    surf.ss0.surface_type = BRW_SURFACE_BUFFER;
    surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
 
-   assert(key->bo);
-   surf.ss1.base_addr = key->bo->offset; /* reloc */
+   assert(bo);
+   surf.ss1.base_addr = bo->offset; /* reloc */
 
    surf.ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
    surf.ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
    surf.ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
-   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
-   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
- 
-   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
-			 key, sizeof(*key),
-			 &key->bo, 1,
-			 &surf, sizeof(surf));
+   surf.ss3.pitch = (width * 16) - 1; /* ignored?? */
+   brw_set_surface_tiling(&surf, I915_TILING_NONE); /* tiling now allowed */
+
+   map = brw_state_batch(brw, sizeof(surf), 32, out_bo, out_offset);
+   memcpy(map, &surf, sizeof(surf));
 
    /* Emit relocation to surface contents.  Section 5.1.1 of the gen4
     * bspec ("Data Cache") says that the data cache does not exist as
     * a separate cache and is just the sampler cache.
     */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_surface_state, ss1),
-			   key->bo, 0,
+   drm_intel_bo_emit_reloc(*out_bo, (*out_offset +
+				     offsetof(struct brw_surface_state, ss1)),
+			   bo, 0,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
-
-   return bo;
 }
 
 /* Creates a new WM constant buffer reflecting the current fragment program's
@@ -337,89 +300,45 @@
  * Otherwise, constants go through the CURBEs using the brw_constant_buffer
  * state atom.
  */
-static drm_intel_bo *
-brw_wm_update_constant_buffer(struct brw_context *brw)
+static void
+prepare_wm_constants(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
    const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
    const int size = params->NumParameters * 4 * sizeof(GLfloat);
-   drm_intel_bo *const_buffer;
+
+   _mesa_load_state_parameters(ctx, fp->program.Base.Parameters);
 
    /* BRW_NEW_FRAGMENT_PROGRAM */
-   if (!fp->use_const_buffer)
-      return NULL;
-
-   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
-				     size, 64);
-
-   /* _NEW_PROGRAM_CONSTANTS */
-   drm_intel_bo_subdata(const_buffer, 0, size, params->ParameterValues);
-
-   return const_buffer;
-}
-
-/**
- * Update the surface state for a WM constant buffer.
- * The constant buffer will be (re)allocated here if needed.
- */
-static void
-brw_update_wm_constant_surface( GLcontext *ctx,
-                                GLuint surf)
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct brw_surface_key key;
-   struct brw_fragment_program *fp =
-      (struct brw_fragment_program *) brw->fragment_program;
-   const struct gl_program_parameter_list *params =
-      fp->program.Base.Parameters;
-
-   /* If we're in this state update atom, we need to update WM constants, so
-    * free the old buffer and create a new one for the new contents.
-    */
-   drm_intel_bo_unreference(fp->const_buffer);
-   fp->const_buffer = brw_wm_update_constant_buffer(brw);
-
-   /* If there's no constant buffer, then no surface BO is needed to point at
-    * it.
-    */
-   if (fp->const_buffer == NULL) {
-      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-      brw->wm.surf_bo[surf] = NULL;
+   if (!fp->use_const_buffer) {
+      if (brw->wm.const_bo)
+	 brw->state.dirty.brw |= BRW_NEW_WM_CONSTBUF;
+      drm_intel_bo_unreference(brw->wm.const_bo); /* no-op when NULL */
+      brw->wm.const_bo = NULL;
       return;
    }
 
-   memset(&key, 0, sizeof(key));
+   drm_intel_bo_unreference(brw->wm.const_bo);
+   brw->wm.const_bo = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
+					 size, 64);
 
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
-   key.bo = fp->const_buffer;
-   key.depthmode = GL_NONE;
-   key.pitch = params->NumParameters;
-   key.width = params->NumParameters;
-   key.height = 1;
-   key.depth = 1;
-   key.cpp = 16;
-
-   /*
-   printf("%s:\n", __FUNCTION__);
-   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
-          key.width, key.height, key.depth, key.cpp, key.pitch);
-   */
-
-   drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->wm.surf_bo[surf] == NULL) {
-      brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
-   }
-   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+   /* _NEW_PROGRAM_CONSTANTS */
+   drm_intel_bo_subdata(brw->wm.const_bo, 0, size, params->ParameterValues);
+   brw->state.dirty.brw |= BRW_NEW_WM_CONSTBUF; /* new BO: surface is stale */
 }
 
+const struct brw_tracked_state brw_wm_constants = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_wm_constants,
+};
+
 /**
  * Updates surface / buffer for fragment shader constant buffer, if
  * one is required.
@@ -428,20 +347,18 @@
  * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
  * inclusion in the binding table.
  */
-static void prepare_wm_constant_surface(struct brw_context *brw )
+static void upload_wm_constant_surface(struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
-   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
-
-   drm_intel_bo_unreference(fp->const_buffer);
-   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+   const struct gl_program_parameter_list *params =
+      fp->program.Base.Parameters;
 
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
-   if (fp->const_buffer == 0) {
+   if (brw->wm.const_bo == 0) {
       if (brw->wm.surf_bo[surf] != NULL) {
 	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
 	 brw->wm.surf_bo[surf] = NULL;
@@ -450,16 +367,20 @@
       return;
    }
 
-   brw_update_wm_constant_surface(ctx, surf);
+   brw_create_constant_surface(brw, brw->wm.const_bo, params->NumParameters,
+			       &brw->wm.surf_bo[surf],
+			       &brw->wm.surf_offset[surf]);
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
 const struct brw_tracked_state brw_wm_constant_surface = {
    .dirty = {
-      .mesa = (_NEW_PROGRAM_CONSTANTS),
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .mesa = 0,
+      .brw = (BRW_NEW_WM_CONSTBUF |
+	      BRW_NEW_BATCH),
       .cache = 0
    },
-   .prepare = prepare_wm_constant_surface,
+   .emit = upload_wm_constant_surface,
 };
 
 
@@ -488,6 +409,8 @@
       uint32_t draw_x;
       uint32_t draw_y;
    } key;
+   struct brw_surface_state surf;
+   void *map;
 
    memset(&key, 0, sizeof(key));
 
@@ -554,137 +477,123 @@
 			 (ctx->Color.BlendEnabled & (1 << unit)));
    }
 
-   drm_intel_bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
-					    BRW_SS_SURFACE,
-					    &key, sizeof(key),
-					    &region_bo, 1,
-					    NULL);
+   memset(&surf, 0, sizeof(surf));
 
-   if (brw->wm.surf_bo[unit] == NULL) {
-      struct brw_surface_state surf;
+   surf.ss0.surface_format = key.surface_format;
+   surf.ss0.surface_type = key.surface_type;
+   if (key.tiling == I915_TILING_NONE) {
+      surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
+   } else {
+      uint32_t tile_base, tile_x, tile_y;
+      uint32_t pitch = key.pitch * key.cpp;
 
-      memset(&surf, 0, sizeof(surf));
-
-      surf.ss0.surface_format = key.surface_format;
-      surf.ss0.surface_type = key.surface_type;
-      if (key.tiling == I915_TILING_NONE) {
-	 surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
+      if (key.tiling == I915_TILING_X) {
+	 tile_x = key.draw_x % (512 / key.cpp);
+	 tile_y = key.draw_y % 8;
+	 tile_base = ((key.draw_y / 8) * (8 * pitch));
+	 tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
       } else {
-	 uint32_t tile_base, tile_x, tile_y;
-	 uint32_t pitch = key.pitch * key.cpp;
-
-	 if (key.tiling == I915_TILING_X) {
-	    tile_x = key.draw_x % (512 / key.cpp);
-	    tile_y = key.draw_y % 8;
-	    tile_base = ((key.draw_y / 8) * (8 * pitch));
-	    tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
-	 } else {
-	    /* Y */
-	    tile_x = key.draw_x % (128 / key.cpp);
-	    tile_y = key.draw_y % 32;
-	    tile_base = ((key.draw_y / 32) * (32 * pitch));
-	    tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
-	 }
-	 assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
-	 assert(tile_x % 4 == 0);
-	 assert(tile_y % 2 == 0);
-	 /* Note that the low bits of these fields are missing, so
-	  * there's the possibility of getting in trouble.
-	  */
-	 surf.ss1.base_addr = tile_base;
-	 surf.ss5.x_offset = tile_x / 4;
-	 surf.ss5.y_offset = tile_y / 2;
+	 /* Y */
+	 tile_x = key.draw_x % (128 / key.cpp);
+	 tile_y = key.draw_y % 32;
+	 tile_base = ((key.draw_y / 32) * (32 * pitch));
+	 tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
       }
-      if (region_bo != NULL)
-	 surf.ss1.base_addr += region_bo->offset; /* reloc */
+      assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
+      assert(tile_x % 4 == 0);
+      assert(tile_y % 2 == 0);
+      /* Note that the low bits of these fields are missing, so
+       * there's the possibility of getting in trouble.
+       */
+      surf.ss1.base_addr = tile_base;
+      surf.ss5.x_offset = tile_x / 4;
+      surf.ss5.y_offset = tile_y / 2;
+   }
+   if (region_bo != NULL)
+      surf.ss1.base_addr += region_bo->offset; /* reloc */
 
-      surf.ss2.width = key.width - 1;
-      surf.ss2.height = key.height - 1;
-      brw_set_surface_tiling(&surf, key.tiling);
-      surf.ss3.pitch = (key.pitch * key.cpp) - 1;
+   surf.ss2.width = key.width - 1;
+   surf.ss2.height = key.height - 1;
+   brw_set_surface_tiling(&surf, key.tiling);
+   surf.ss3.pitch = (key.pitch * key.cpp) - 1;
 
-      if (intel->gen < 6) {
-	 /* _NEW_COLOR */
-	 surf.ss0.color_blend = key.color_blend;
-	 surf.ss0.writedisable_red =   !key.color_mask[0];
-	 surf.ss0.writedisable_green = !key.color_mask[1];
-	 surf.ss0.writedisable_blue =  !key.color_mask[2];
-	 surf.ss0.writedisable_alpha = !key.color_mask[3];
-      }
+   if (intel->gen < 6) {
+      /* _NEW_COLOR */
+      surf.ss0.color_blend = key.color_blend;
+      surf.ss0.writedisable_red =   !key.color_mask[0];
+      surf.ss0.writedisable_green = !key.color_mask[1];
+      surf.ss0.writedisable_blue =  !key.color_mask[2];
+      surf.ss0.writedisable_alpha = !key.color_mask[3];
+   }
 
-      /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
-                                               BRW_SS_SURFACE,
-                                               &key, sizeof(key),
-					       &region_bo, 1,
-					       &surf, sizeof(surf));
-      if (region_bo != NULL) {
-	 /* We might sample from it, and we might render to it, so flag
-	  * them both.  We might be able to figure out from other state
-	  * a more restrictive relocation to emit.
-	  */
-	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
-				 offsetof(struct brw_surface_state, ss1),
-				 region_bo,
-				 surf.ss1.base_addr - region_bo->offset,
-				 I915_GEM_DOMAIN_RENDER,
-				 I915_GEM_DOMAIN_RENDER);
-      }
+   map = brw_state_batch(brw, sizeof(surf), 32,
+			 &brw->wm.surf_bo[unit],
+			 &brw->wm.surf_offset[unit]);
+   memcpy(map, &surf, sizeof(surf));
+
+   if (region_bo != NULL) {
+      drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
+			      brw->wm.surf_offset[unit] +
+			      offsetof(struct brw_surface_state, ss1),
+			      region_bo,
+			      surf.ss1.base_addr - region_bo->offset,
+			      I915_GEM_DOMAIN_RENDER,
+			      I915_GEM_DOMAIN_RENDER);
    }
 }
 
+/* Hands every BO backing a WM surface (draw buffers, constant buffer,
+ * enabled textures) to brw_add_validated_bo() and recomputes
+ * brw->wm.nr_surfaces, flagging BRW_NEW_NR_WM_SURFACES when it changes. */
+static void
+prepare_wm_surfaces(struct brw_context *brw)
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   int i, nr_surfaces = 0;
+
+   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+      struct intel_region *region = irb ? irb->region : NULL;
+
+      /* irb, and therefore region, may be NULL: only validate a real BO. */
+      if (region)
+	 brw_add_validated_bo(brw, region->buffer);
+      nr_surfaces = SURF_INDEX_DRAW(i) + 1;
+   }
+
+   if (brw->wm.const_bo) {
+      brw_add_validated_bo(brw, brw->wm.const_bo);
+      nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
+   }
+
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
+      struct gl_texture_object *tObj = texUnit->_Current;
+      struct intel_texture_object *intelObj = intel_texture_object(tObj);
+
+      if (texUnit->_ReallyEnabled) {
+	 brw_add_validated_bo(brw, intelObj->mt->region->buffer);
+	 nr_surfaces = SURF_INDEX_TEXTURE(i) + 1;
+      }
+   }
+
+   /* Must be updated in our prepare: the unit's prepare relies on it. */
+   if (brw->wm.nr_surfaces != nr_surfaces) {
+      brw->wm.nr_surfaces = nr_surfaces;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   }
+}
 
 /**
- * Constructs the binding table for the WM surface state, which maps unit
- * numbers to surface state objects.
+ * Constructs the set of surface state objects pointed to by the
+ * binding table.
  */
-static drm_intel_bo *
-brw_wm_get_binding_table(struct brw_context *brw)
-{
-   drm_intel_bo *bind_bo;
-
-   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
-
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->wm.surf_bo, brw->wm.nr_surfaces,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = brw->wm.nr_surfaces * sizeof(GLuint);
-      uint32_t data[BRW_WM_MAX_SURF];
-      int i;
-
-      for (i = 0; i < brw->wm.nr_surfaces; i++)
-         if (brw->wm.surf_bo[i])
-            data[i] = brw->wm.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->wm.surf_bo, brw->wm.nr_surfaces,
-				  data, data_size);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_WM_MAX_SURF; i++) {
-	 if (brw->wm.surf_bo[i] != NULL) {
-	    drm_intel_bo_emit_reloc(bind_bo, i * sizeof(GLuint),
-				    brw->wm.surf_bo[i], 0,
-				    I915_GEM_DOMAIN_INSTRUCTION, 0);
-	 }
-      }
-   }
-
-   return bind_bo;
-}
-
-static void prepare_wm_surfaces(struct brw_context *brw )
+static void
+upload_wm_surfaces(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
-   int old_nr_surfaces;
 
    /* _NEW_BUFFERS | _NEW_COLOR */
    /* Update surfaces for drawing buffers */
@@ -698,32 +607,21 @@
       brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
-   old_nr_surfaces = brw->wm.nr_surfaces;
-   brw->wm.nr_surfaces = BRW_MAX_DRAW_BUFFERS;
-
-   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
-       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
-
    /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
       const GLuint surf = SURF_INDEX_TEXTURE(i);
 
-      /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
+      /* _NEW_TEXTURE */
       if (texUnit->_ReallyEnabled) {
 	 brw_update_texture_surface(ctx, i);
-	 brw->wm.nr_surfaces = surf + 1;
       } else {
          drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
          brw->wm.surf_bo[surf] = NULL;
       }
    }
 
-   drm_intel_bo_unreference(brw->wm.bind_bo);
-   brw->wm.bind_bo = brw_wm_get_binding_table(brw);
-
-   if (brw->wm.nr_surfaces != old_nr_surfaces)
-      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
 const struct brw_tracked_state brw_wm_surfaces = {
@@ -731,12 +629,48 @@
       .mesa = (_NEW_COLOR |
                _NEW_TEXTURE |
                _NEW_BUFFERS),
-      .brw = (BRW_NEW_CONTEXT |
-	      BRW_NEW_WM_SURFACES),
+      .brw = (BRW_NEW_BATCH),
       .cache = 0
    },
    .prepare = prepare_wm_surfaces,
+   .emit = upload_wm_surfaces,
 };
 
+/**
+ * Constructs the binding table for the WM surface state, which maps unit
+ * numbers to surface state objects.
+ */
+static void
+brw_wm_upload_binding_table(struct brw_context *brw)
+{
+   uint32_t *bind;
+   int i;
 
+   /* Might want to calculate nr_surfaces first, to avoid taking up so much
+    * space for the binding table.
+    */
+   bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_WM_MAX_SURF,
+			  32, &brw->wm.bind_bo, &brw->wm.bind_bo_offset);
 
+   for (i = 0; i < BRW_WM_MAX_SURF; i++) {
+      /* BRW_NEW_WM_SURFACES */
+      /* Slots without a surface state BO are written as 0. */
+      if (brw->wm.surf_bo[i]) {
+	 bind[i] = brw->wm.surf_offset[i];
+      } else {
+	 bind[i] = 0;
+      }
+   }
+
+   brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
+}
+
+const struct brw_tracked_state brw_wm_binding_table = {
+   .dirty = {
+      .mesa = 0,
+      .brw = (BRW_NEW_BATCH |
+	      BRW_NEW_WM_SURFACES),
+      .cache = 0
+   },
+   .emit = brw_wm_upload_binding_table,
+};
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 2e21e5f..34a9dc2 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -63,8 +63,7 @@
 
    drm_intel_bo_unreference(brw->sf.state_bo);
    brw->sf.state_bo = brw_cache_data(&brw->cache, BRW_SF_UNIT,
-				     &scissor, sizeof(scissor),
-				     NULL, 0);
+				     &scissor, sizeof(scissor));
 }
 
 const struct brw_tracked_state gen6_scissor_state = {
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 8d96b44..51940ef 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -87,7 +87,7 @@
    if (ctx->Polygon.CullFlag) {
       switch (ctx->Polygon.CullFaceMode) {
       case GL_FRONT:
-	 dw3 |= GEN6_SF_CULL_BOTH;
+	 dw3 |= GEN6_SF_CULL_FRONT;
 	 break;
       case GL_BACK:
 	 dw3 |= GEN6_SF_CULL_BACK;
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 0c2aa42..301c68e 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -29,7 +29,6 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
-#include "main/macros.h"
 
 /* The clip VP defines the guardband region where expensive clipping is skipped
  * and fragments are allowed to be generated and clipped out cheaply by the SF.
@@ -51,8 +50,7 @@
 
    drm_intel_bo_unreference(brw->clip.vp_bo);
    brw->clip.vp_bo = brw_cache_data(&brw->cache, BRW_CLIP_VP,
-				    &vp, sizeof(vp),
-				    NULL, 0);
+				    &vp, sizeof(vp));
 }
 
 const struct brw_tracked_state gen6_clip_vp = {
@@ -95,8 +93,7 @@
 
    drm_intel_bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP,
-				  &sfv, sizeof(sfv),
-				  NULL, 0);
+				  &sfv, sizeof(sfv));
 }
 
 const struct brw_tracked_state gen6_sf_vp = {
@@ -108,36 +105,6 @@
    .prepare = prepare_sf_vp,
 };
 
-static void
-prepare_cc_vp(struct brw_context *brw)
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   struct brw_cc_viewport ccv;
-
-   /* _NEW_TRANSOFORM */
-   if (ctx->Transform.DepthClamp) {
-      /* _NEW_VIEWPORT */
-      ccv.min_depth = MIN2(ctx->Viewport.Near, ctx->Viewport.Far);
-      ccv.max_depth = MAX2(ctx->Viewport.Near, ctx->Viewport.Far);
-   } else {
-      ccv.min_depth = 0.0;
-      ccv.max_depth = 1.0;
-   }
-
-   drm_intel_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv),
-				  NULL, 0);
-}
-
-const struct brw_tracked_state gen6_cc_vp = {
-   .dirty = {
-      .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM,
-      .brw = 0,
-      .cache = 0,
-   },
-   .prepare = prepare_cc_vp,
-};
-
 static void prepare_viewport_state_pointers(struct brw_context *brw)
 {
    brw_add_validated_bo(brw, brw->sf.state_bo);
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 325f6b4..863c854 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -98,7 +98,8 @@
 
    /* CACHE_NEW_SAMPLER */
    dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
-   dw4 |= (1 << GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
+   dw4 |= (brw->wm.prog_data->first_curbe_grf <<
+	   GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
 
    dw5 |= (40 - 1) << GEN6_WM_MAX_THREADS_SHIFT;
    dw5 |= GEN6_WM_DISPATCH_ENABLE;
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 1116bcc..698445c 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -49,6 +49,7 @@
    batch->ptr = batch->map;
    batch->reserved_space = BATCH_RESERVED;
    batch->dirty_state = ~0;
+   batch->state_batch_offset = batch->size;
 }
 
 struct intel_batchbuffer *
@@ -84,6 +85,12 @@
    int x_off = 0, y_off = 0;
 
    drm_intel_bo_subdata(batch->buf, 0, used, batch->buffer);
+   if (batch->state_batch_offset != batch->size) {
+      drm_intel_bo_subdata(batch->buf,
+			   batch->state_batch_offset,
+			   batch->size - batch->state_batch_offset,
+			   batch->buffer + batch->state_batch_offset);
+   }
 
    batch->ptr = NULL;
 
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index f4ac182..ae53f45 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -23,6 +23,7 @@
    GLubyte *ptr;
 
    GLuint size;
+   uint32_t state_batch_offset;
 
 #ifdef DEBUG
    /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
@@ -92,7 +93,8 @@
 static INLINE GLint
 intel_batchbuffer_space(struct intel_batchbuffer *batch)
 {
-   return (batch->size - batch->reserved_space) - (batch->ptr - batch->map);
+   return (batch->state_batch_offset - batch->reserved_space) -
+      (batch->ptr - batch->map);
 }
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index dec4797..5f2035d 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -827,8 +827,6 @@
 
    assert(intel);               /* should never be null */
    if (intel) {
-      GLboolean release_texture_heaps;
-
       INTEL_FIREVERTICES(intel);
 
       _mesa_meta_free(&intel->ctx);
@@ -837,7 +835,6 @@
 
       intel->vtbl.destroy(intel);
 
-      release_texture_heaps = (intel->ctx.Shared->RefCount == 1);
       _swsetup_DestroyContext(&intel->ctx);
       _tnl_DestroyContext(&intel->ctx);
       _vbo_DestroyContext(&intel->ctx);
@@ -855,18 +852,6 @@
       drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
       intel->first_post_swapbuffers_batch = NULL;
 
-      if (release_texture_heaps) {
-         /* Nothing is currently done here to free texture heaps;
-          * but we're not using the texture heap utilities, so I
-          * rather think we shouldn't.  I've taken a look, and can't
-          * find any private texture data hanging around anywhere, but
-          * I'm not yet certain there isn't any at all...
-          */
-         /* if (INTEL_DEBUG & DEBUG_TEXTURE)
-            fprintf(stderr, "do something to free texture heaps\n");
-          */
-      }
-
       driDestroyOptionCache(&intel->optionCache);
 
       /* free the Mesa context */
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 14ff4a9..c7ac2de 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -261,6 +261,8 @@
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
+#define ROUND_DOWN_TO(value, alignment) \
+   ((value) & ~((alignment) - 1)) /* largest multiple of a power-of-two alignment that is <= value */
 #define IS_POWER_OF_TWO(val) (((val) & (val - 1)) == 0)
 
 static INLINE uint32_t
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 8f61f1f..4a83886 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -42,9 +42,6 @@
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
-#ifndef I915
-#include "brw_state.h"
-#endif
 
 #define FILE_DEBUG_FLAG DEBUG_FBO
 
@@ -296,12 +293,6 @@
    old = rb->region;
    rb->region = NULL;
    intel_region_reference(&rb->region, region);
-#ifndef I915
-   if (old) {
-      brw_state_cache_bo_delete(&brw_context(&intel->ctx)->surface_cache,
-				old->buffer);
-   }
-#endif
    intel_region_release(&old);
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 71ef7a8..39ac020 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -29,9 +29,6 @@
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
 #include "intel_tex_layout.h"
-#ifndef I915
-#include "brw_state.h"
-#endif
 #include "main/enums.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
@@ -203,19 +200,6 @@
 
       DBG("%s deleting %p\n", __FUNCTION__, *mt);
 
-#ifndef I915
-      /* Free up cached binding tables holding a reference on our buffer, to
-       * avoid excessive memory consumption.
-       *
-       * This isn't as aggressive as we could be, as we'd like to do
-       * it from any time we free the last ref on a region.  But intel_region.c
-       * is context-agnostic.  Perhaps our constant state cache should be, as
-       * well.
-       */
-      brw_state_cache_bo_delete(&brw_context(&intel->ctx)->surface_cache,
-				(*mt)->region->buffer);
-#endif
-
       intel_region_release(&((*mt)->region));
 
       for (i = 0; i < MAX_TEXTURE_LEVELS; i++) {
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index c30552c..fb840c1 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -257,6 +257,8 @@
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
       if (ctx->Texture.Unit[i]._ReallyEnabled) {
          struct gl_texture_object *texObj = ctx->Texture.Unit[i]._Current;
+
+         intel_finalize_mipmap_tree(intel, i);
          intel_tex_map_images(intel, intel_texture_object(texObj));
       }
    }
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
index 34d22b4..ff3801d 100644
--- a/src/mesa/drivers/dri/r300/compiler/Makefile
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -9,6 +9,7 @@
 		radeon_code.c \
 		radeon_compiler.c \
 		radeon_emulate_branches.c \
+		radeon_emulate_loops.c \
 		radeon_program.c \
 		radeon_program_print.c \
 		radeon_opcodes.c \
diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript
index 663926e..50d9cdb 100755
--- a/src/mesa/drivers/dri/r300/compiler/SConscript
+++ b/src/mesa/drivers/dri/r300/compiler/SConscript
@@ -23,6 +23,7 @@
         'radeon_pair_regalloc.c',
         'radeon_optimize.c',
         'radeon_emulate_branches.c',
+        'radeon_emulate_loops.c',
         'radeon_dataflow.c',
         'radeon_dataflow_deadcode.c',
         'radeon_dataflow_swizzles.c',
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 7f3b88e..bbdfa0d 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -26,6 +26,7 @@
 
 #include "radeon_dataflow.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
 #include "radeon_program_alu.h"
 #include "radeon_program_tex.h"
 #include "r300_fragprog.h"
@@ -103,6 +104,14 @@
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
+	
+	if (c->Base.is_r500) {
+		rc_emulate_loops(&c->Base, R500_PFS_MAX_INST);
+	} else {
+		rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST);
+	}
+	debug_program_log(c, "after emulate loops");
+
 	rc_emulate_branches(&c->Base);
 
 	debug_program_log(c, "after emulate branches");
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index 507b2e5..e984797 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -30,6 +30,7 @@
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
 
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
@@ -348,7 +349,8 @@
 		if (!valid_dst(compiler->code, &vpi->DstReg))
 			continue;
 
-		if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+		if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS ||
+		    (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) {
 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
 			return;
 		}
@@ -404,7 +406,7 @@
 {
 	struct rc_instruction *inst;
 	unsigned int num_orig_temps = 0;
-	char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
 	unsigned int i, j;
 
@@ -463,11 +465,11 @@
 				unsigned int orig = inst->U.I.DstReg.Index;
 
 				if (!ta[orig].Allocated) {
-					for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+					for(j = 0; j < R300_VS_MAX_TEMPS; ++j) {
 						if (!hwtemps[j])
 							break;
 					}
-					if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+					if (j >= R300_VS_MAX_TEMPS) {
 						fprintf(stderr, "Out of hw temporaries\n");
 					} else {
 						ta[orig].Allocated = 1;
@@ -600,6 +602,13 @@
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
+	if (compiler->Base.is_r500){
+		rc_emulate_loops(&compiler->Base, R500_VS_MAX_ALU);
+	} else {
+		rc_emulate_loops(&compiler->Base, R300_VS_MAX_ALU);
+	}
+	debug_program_log(compiler, "after emulate loops");
+
 	rc_emulate_branches(&compiler->Base);
 
 	debug_program_log(compiler, "after emulate branches");
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index 1979e7e..d036897 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -235,8 +235,11 @@
 };
 
 
-#define VSF_MAX_FRAGMENT_LENGTH (255*4)
-#define VSF_MAX_FRAGMENT_TEMPS (14)
+#define R300_VS_MAX_ALU		256
+#define R300_VS_MAX_ALU_DWORDS  (R300_VS_MAX_ALU * 4)
+#define R500_VS_MAX_ALU	        1024
+#define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
+#define R300_VS_MAX_TEMPS	32
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -244,8 +247,8 @@
 struct r300_vertex_program_code {
 	int length;
 	union {
-		uint32_t d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
+		uint32_t d[R500_VS_MAX_ALU_DWORDS];
+		float f[R500_VS_MAX_ALU_DWORDS];
 	} body;
 
 	int pos_end;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
new file mode 100644
index 0000000..4c5d29f
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright 2010 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ */
+
+#include "radeon_emulate_loops.h"
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+#define VERBOSE 0
+
+#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
+
+struct emulate_loop_state {
+	struct radeon_compiler * C;
+	struct loop_info * Loops;	/* one entry per BGNLOOP processed; an outer loop precedes its nested loops */
+	unsigned int LoopCount;	/* number of entries of Loops in use */
+	unsigned int LoopReserved;	/* capacity value passed to memory_pool_array_reserve() */
+};
+
+struct loop_info {
+	struct rc_instruction * BeginLoop;	/* the BGNLOOP instruction */
+	struct rc_instruction * Cond;	/* comparison located right before If */
+	struct rc_instruction * If;	/* IF located right before Brk */
+	struct rc_instruction * Brk;	/* BRK found while scanning the loop */
+	struct rc_instruction * EndIf;	/* ENDIF located right after Brk */
+	struct rc_instruction * EndLoop;	/* ENDLOOP; NULL until found and again after constant unrolling */
+};
+
+struct const_value {
+	/* State for update_const_value(): last immediate seen written to Src. */
+	struct radeon_compiler * C;
+	struct rc_src_register * Src;	/* register (file, index, swizzle) being tracked */
+	float Value;	/* meaningful only when HasValue is set */
+	int HasValue;
+};
+
+struct count_inst {
+	struct radeon_compiler * C;
+	int Index;	/* index of the temporary used as the loop counter */
+	rc_swizzle Swz;	/* swizzle with which the loop condition reads the counter */
+	float Amount;	/* accumulated change applied to the counter by the loop body */
+	int Unknown;	/* set to 1 when a write to the counter cannot be analysed */
+};
+
+static float get_constant_value(struct radeon_compiler * c,
+						struct rc_src_register * src,
+						int chan)
+{
+	/* Component of the constant selected by the swizzle for 'chan'. */
+	const int comp = GET_SWZ(src->Swizzle, chan);
+	float value;
+
+	if(comp >= 4 || src->Index >= c->Program.Constants.Count){
+		rc_error(c, "get_constant_value: Can't find a value.\n");
+		return 0.0f;
+	}
+	value = c->Program.Constants.Constants[src->Index].u.Immediate[comp];
+	/* Apply the per-channel negate bit of the source operand. */
+	return GET_BIT(src->Negate, chan) ? -1.0f * value : 1.0f * value;
+}
+
+static int src_reg_is_immediate(struct rc_src_register * src,
+						struct radeon_compiler * c)
+{
+	if(src->File != RC_FILE_CONSTANT) return 0; /* only constant-file operands qualify */
+	return c->Program.Constants.Constants[src->Index].Type == RC_CONSTANT_IMMEDIATE;
+}
+
+static unsigned int loop_count_instructions(struct loop_info * loop)
+{
+	/* Number of instructions strictly between BGNLOOP and ENDLOOP. */
+	unsigned int n = 0;
+	struct rc_instruction * cur;
+
+	for(cur = loop->BeginLoop->Next; cur != loop->EndLoop; cur = cur->Next)
+		n++;
+	return n;
+}
+
+static unsigned int loop_calc_iterations(struct loop_info * loop,
+		unsigned int loop_count, unsigned int max_instructions)
+{
+	unsigned int cost = loop_count * loop_count_instructions(loop);
+	return cost ? max_instructions / cost : 0; /* zero cost (empty body): avoid dividing by 0 */
+}
+
+static void loop_unroll(struct emulate_loop_state * s,
+			struct loop_info *loop, unsigned int iterations)
+{
+	struct rc_instruction * const body_first = loop->BeginLoop->Next;
+	struct rc_instruction * const body_last = loop->EndLoop->Prev;
+	struct rc_instruction * tail = body_last;
+	struct rc_instruction * src;
+	unsigned int pass;
+	rc_remove_instruction(loop->BeginLoop); /* markers go away; the body stays */
+	rc_remove_instruction(loop->EndLoop);   /* as the first iteration */
+	for(pass = 1; pass < iterations; pass++){
+		for(src = body_first; src != body_last->Next; src = src->Next){
+			struct rc_instruction * copy = rc_alloc_instruction(s->C);
+			memcpy(copy, src, sizeof(struct rc_instruction));
+			rc_insert_instruction(tail, copy);
+			tail = copy; /* keep appending after the newest copy */
+		}
+	}
+}
+
+
+static void update_const_value(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int mask)
+{
+	struct const_value * value = data;
+	/* Ignore writes that do not touch the tracked register channel. */
+	if(value->Src->File != file ||
+	   value->Src->Index != index ||
+	   !(1 << GET_SWZ(value->Src->Swizzle, 0) & mask)){
+		return;
+	}
+	/* Any write other than a MOV from an immediate makes the value
+	 * unknown, so drop whatever an earlier instruction established. */
+	value->HasValue = 0;
+	if(inst->U.I.Opcode != RC_OPCODE_MOV ||
+	   !src_reg_is_immediate(&inst->U.I.SrcReg[0], value->C)){
+		return;
+	}
+	value->HasValue = 1;
+	value->Value = get_constant_value(value->C, &inst->U.I.SrcReg[0], 0);
+}
+
+/* rc_for_all_writes_mask() callback: accumulates into count_inst->Amount the step a write applies to the counter. */
+static void get_incr_amount(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int mask)
+{
+	struct count_inst * count_inst = data;
+	int amnt_src_index;
+	const struct rc_opcode_info * opcode;
+	float amount;
+	/* Skip writes that do not touch the counter's temporary channel. */
+	if(file != RC_FILE_TEMPORARY ||
+	   count_inst->Index != index ||
+	   !((1 << GET_SWZ(count_inst->Swz,0)) & mask)){
+		return;
+	}
+	/* Find the index of the counter register. */
+	opcode = rc_get_opcode_info(inst->U.I.Opcode);
+	if(opcode->NumSrcRegs != 2){
+		count_inst->Unknown = 1;
+		return;
+	}
+	if(inst->U.I.SrcReg[0].File == RC_FILE_TEMPORARY &&
+	   inst->U.I.SrcReg[0].Index == count_inst->Index &&
+	   inst->U.I.SrcReg[0].Swizzle == count_inst->Swz){
+		amnt_src_index = 1;
+	} else if( inst->U.I.SrcReg[1].File == RC_FILE_TEMPORARY &&
+		   inst->U.I.SrcReg[1].Index == count_inst->Index &&
+		   inst->U.I.SrcReg[1].Swizzle == count_inst->Swz){
+		amnt_src_index = 0;
+	}
+	else{
+		count_inst->Unknown = 1;
+		return;
+	}
+	if(src_reg_is_immediate(&inst->U.I.SrcReg[amnt_src_index],
+							count_inst->C)){
+		amount = get_constant_value(count_inst->C,
+				&inst->U.I.SrcReg[amnt_src_index], 0);
+	}
+	else{
+		count_inst->Unknown = 1 ;
+		return;
+	}
+	switch(inst->U.I.Opcode){
+	case RC_OPCODE_ADD:
+		count_inst->Amount += amount;
+		break;
+	case RC_OPCODE_SUB:
+		if(amnt_src_index == 0){ /* "immediate - counter" is not a fixed step */
+			count_inst->Unknown = 1;
+			return;
+		}
+		count_inst->Amount -= amount;
+		break;
+	default:
+		count_inst->Unknown = 1;
+		return;
+	}
+}
+
+/* Try to fully unroll a loop whose iteration count is known at compile time.
+ * 'cond' is the loop's comparison; one of its first two operands must be an
+ * immediate (the limit), the other one is taken to be the counter.
+ * Returns 1 if the loop was unrolled in place, 0 if nothing was changed. */
+static int transform_const_loop(struct emulate_loop_state * s,
+						struct loop_info * loop,
+						struct rc_instruction * cond)
+{
+	int end_loops = 1;
+	int iterations;
+	struct count_inst count_inst;
+	float limit_value;
+	struct rc_src_register * counter;
+	struct rc_src_register * limit;
+	struct const_value counter_value;
+	struct rc_instruction * inst;
+	/* Find the counter and the upper limit */
+	if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){
+		limit = &cond->U.I.SrcReg[0];
+		counter = &cond->U.I.SrcReg[1];
+	} else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){
+		limit = &cond->U.I.SrcReg[1];
+		counter = &cond->U.I.SrcReg[0];
+	} else{
+		DBG("No constant limit.\n");
+		return 0;
+	}
+	/* Find the initial value of the counter */
+	counter_value.Src = counter;
+	counter_value.Value = 0.0f;
+	counter_value.HasValue = 0;
+	counter_value.C = s->C;
+	for(inst = s->C->Program.Instructions.Next; inst != loop->BeginLoop;
+							inst = inst->Next){
+		rc_for_all_writes_mask(inst, update_const_value, &counter_value);
+	}
+	if(!counter_value.HasValue){
+		DBG("Initial counter value cannot be determined.\n");
+		return 0;
+	}
+	DBG("Initial counter value is %f\n", counter_value.Value);
+	/* Determine how the counter is modified each loop */
+	count_inst.C = s->C;
+	count_inst.Index = counter->Index;
+	count_inst.Swz = counter->Swizzle;
+	count_inst.Amount = 0.0f;
+	count_inst.Unknown = 0;
+	for(inst = loop->BeginLoop->Next; end_loops > 0; inst = inst->Next){
+		switch(inst->U.I.Opcode){
+		/* XXX In the future we might want to try to unroll nested
+		 * loops here.*/
+		case RC_OPCODE_BGNLOOP:
+			end_loops++;
+			break;
+		case RC_OPCODE_ENDLOOP:
+			end_loops--;
+			if(end_loops == 0) loop->EndLoop = inst; /* nested ENDLOOPs must not replace ours */
+			break;
+		/* XXX Check if the counter is modified within an if statement.
+		 */
+		case RC_OPCODE_IF:
+			break;
+		default:
+			rc_for_all_writes_mask(inst, get_incr_amount, &count_inst);
+			if(count_inst.Unknown){
+				return 0;
+			}
+			break;
+		}
+	}
+	/* Infinite loop */
+	if(count_inst.Amount == 0.0f){
+		return 0;
+	}
+	DBG("Counter is increased by %f each iteration.\n", count_inst.Amount);
+	/* Calculate the number of iterations of this loop.  Keeping this
+	 * simple, since we only support increment and decrement loops.
+	 * NOTE(review): the comparison opcode of 'cond' is not consulted here. */
+	limit_value = get_constant_value(s->C, limit, 0);
+	iterations = (int) ((limit_value - counter_value.Value) /
+							count_inst.Amount);
+	DBG("Loop will have %d iterations.\n", iterations);
+	/* <= 0 iterations cannot be produced by copying the body (and wraps as unsigned). */
+	if(iterations <= 0){
+		return 0;
+	}
+	/* Prepare loop for unrolling */
+	rc_remove_instruction(loop->Cond);
+	rc_remove_instruction(loop->If);
+	rc_remove_instruction(loop->Brk);
+	rc_remove_instruction(loop->EndIf);
+	loop_unroll(s, loop, iterations);
+	loop->EndLoop = NULL;
+	return 1;
+}
+
+/** 
+ * This function prepares a loop to be unrolled by converting it into an if
+ * statement.  Here is an outline of the conversion process:
+ * BGNLOOP;                         	-> BGNLOOP;
+ * <Additional conditional code>	-> <Additional conditional code>
+ * SGE/SLT temp[0], temp[1], temp[2];	-> SLT/SGE temp[0], temp[1], temp[2];
+ * IF temp[0];                      	-> IF temp[0];
+ * BRK;                             	->
+ * ENDIF;                           	-> <Loop Body>
+ * <Loop Body>                      	-> ENDIF;
+ * ENDLOOP;                         	-> ENDLOOP
+ *
+ * @param inst A pointer to a BGNLOOP instruction.
+ * @return If the loop can be unrolled, a pointer to the first instruction of
+ * 		the unrolled loop.
+ * 	   Otherwise, A pointer to the ENDLOOP instruction.
+ * 	   Null if there is an error.
+ */
+static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
+						struct rc_instruction * inst)
+{
+	struct loop_info *loop;
+	struct rc_instruction * ptr;
+	unsigned int loop_idx;
+	memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->LoopCount, s->LoopReserved, 1);
+	loop_idx = s->LoopCount++;
+	loop = &s->Loops[loop_idx];
+	memset(loop, 0, sizeof(struct loop_info));
+	if(inst->U.I.Opcode != RC_OPCODE_BGNLOOP){
+		rc_error(s->C, "%s: expected BGNLOOP\n", __FUNCTION__);
+		return NULL;
+	}
+	loop->BeginLoop = inst;
+	for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next){
+		switch(ptr->U.I.Opcode){
+		case RC_OPCODE_BGNLOOP:
+			/* Nested loop: re-fetch our entry, the recursion may have regrown s->Loops. */
+			ptr = transform_loop(s, ptr);
+			loop = &s->Loops[loop_idx];
+			if(!ptr){
+				return NULL;
+			}
+			break;
+		case RC_OPCODE_BRK:
+			loop->Brk = ptr;
+			if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF){
+				rc_error(s->C, "%s: expected ENDIF\n",__FUNCTION__);
+				return NULL;
+			}
+			loop->EndIf = ptr->Next;
+			if(ptr->Prev->U.I.Opcode != RC_OPCODE_IF){
+				rc_error(s->C, "%s: expected IF\n", __FUNCTION__);
+				return NULL;
+			}
+			loop->If = ptr->Prev;
+			switch(loop->If->Prev->U.I.Opcode){
+			case RC_OPCODE_SLT:
+			case RC_OPCODE_SGE:
+			case RC_OPCODE_SGT:
+			case RC_OPCODE_SLE:
+			case RC_OPCODE_SEQ:
+			case RC_OPCODE_SNE:
+				break;
+			default:
+				rc_error(s->C, "%s expected conditional\n", __FUNCTION__);
+				return NULL;
+			}
+			loop->Cond = loop->If->Prev;
+			ptr = loop->EndIf;
+			break;
+		case RC_OPCODE_ENDLOOP:
+			loop->EndLoop = ptr;
+			break;
+		}
+	}
+	if(!loop->Cond){ /* no "compare; IF; BRK; ENDIF" sequence was found */
+		rc_error(s->C, "%s: loop has no BRK\n", __FUNCTION__);
+		loop->EndLoop = NULL; /* so rc_unroll_loops() skips this entry */
+		return NULL;
+	}
+	/* Reverse the conditional instruction */
+	switch(loop->Cond->U.I.Opcode){
+	case RC_OPCODE_SGE:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SLT;
+		break;
+	case RC_OPCODE_SLT:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SGE;
+		break;
+	case RC_OPCODE_SLE:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SGT;
+		break;
+	case RC_OPCODE_SGT:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SLE;
+		break;
+	case RC_OPCODE_SEQ:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SNE;
+		break;
+	case RC_OPCODE_SNE:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SEQ;
+		break;
+	default:
+		rc_error(s->C, "loop->Cond is not a conditional.\n");
+		return NULL;
+	}
+	/* Check if the number of loops is known at compile time. */
+	if(transform_const_loop(s, loop, loop->Cond)){ /* 'ptr' is past ENDLOOP here */
+		return loop->BeginLoop->Next;
+	}
+	/* Prepare the loop to be unrolled */
+	rc_remove_instruction(loop->Brk);
+	rc_remove_instruction(loop->EndIf);
+	rc_insert_instruction(loop->EndLoop->Prev, loop->EndIf);
+	return loop->EndLoop;
+}
+
+static void rc_transform_loops(struct emulate_loop_state * s)
+{
+	struct rc_instruction * const end = &s->C->Program.Instructions;
+	struct rc_instruction * cur;
+
+	/* Walk the whole program; every BGNLOOP met here is handed to
+	 * transform_loop(), and scanning resumes after what it returns. */
+	for(cur = end->Next; cur != end; cur = cur->Next){
+		if(cur->Type != RC_INSTRUCTION_NORMAL) continue;
+		if(cur->U.I.Opcode != RC_OPCODE_BGNLOOP) continue;
+		cur = transform_loop(s, cur);
+		if(!cur) return; /* transform_loop() failed: stop scanning */
+	}
+}
+
+static void rc_unroll_loops(struct emulate_loop_state *s,
+						unsigned int max_instructions)
+{
+	int i;
+	unsigned int iterations; /* declared up front: no declaration after a statement */
+	/* Iterate backwards over the list of loops so that nested loops,
+	 * which are recorded after their parents, are unrolled first. */
+	for( i = s->LoopCount - 1; i >= 0; i-- ){
+		if(!s->Loops[i].EndLoop){
+			continue; /* no ENDLOOP recorded (e.g. already fully unrolled) */
+		}
+		iterations = loop_calc_iterations(&s->Loops[i],
+						s->LoopCount, max_instructions);
+		loop_unroll(s, &s->Loops[i], iterations);
+	}
+}
+
+void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions)
+{
+	struct emulate_loop_state state;
+
+	/* Start from an all-zero state: no loops recorded, nothing reserved. */
+	memset(&state, 0, sizeof(state));
+	state.C = c;
+
+	/* Pass 1 rewrites each loop (or fully unrolls constant ones); pass 2
+	 * replicates the remaining loop bodies within the instruction budget.
+	 * These may eventually move to r3xx_(vert|frag)prog.c so optimization
+	 * passes can run in between and allow more unrolls per loop. */
+	rc_transform_loops(&state);
+	rc_unroll_loops(&state, max_instructions);
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
new file mode 100644
index 0000000..ddcf1c0
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -0,0 +1,12 @@
+
+
+#ifndef RADEON_EMULATE_LOOPS_H
+#define RADEON_EMULATE_LOOPS_H
+
+#define MAX_ITERATIONS 8 /* NOTE(review): no use visible in radeon_emulate_loops.c - confirm it is still needed */
+
+struct radeon_compiler;
+/* Emulates loops in c's program by unrolling; max_instructions is the budget used to size non-constant unrolls. */
+void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions);
+
+#endif /* RADEON_EMULATE_LOOPS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index d593b3e..1dc1685 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -368,6 +368,24 @@
 		.NumSrcRegs = 0
 	},
 	{
+		.Opcode = RC_OPCODE_BGNLOOP,
+		.Name = "BGNLOOP",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
+		.Opcode = RC_OPCODE_BRK,
+		.Name = "BRK",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
+		.Opcode = RC_OPCODE_ENDLOOP,
+		.Name = "ENDLOOP",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0,
+	},
+	{
 		.Opcode = RC_OPCODE_REPL_ALPHA,
 		.Name = "REPL_ALPHA",
 		.HasDstReg = 1
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 87a2e23..91c82ac 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -180,6 +180,12 @@
 
 	/** branch instruction: has no effect */
 	RC_OPCODE_ENDIF,
+	
+	RC_OPCODE_BGNLOOP,
+
+	RC_OPCODE_BRK,
+
+	RC_OPCODE_ENDLOOP,
 
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 6992ca5..e4b302b 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -376,13 +376,12 @@
 	ctx->Const.MaxDrawBuffers = 1;
 	ctx->Const.MaxColorAttachments = 1;
 
-	/* currently bogus data */
 	if (r300->options.hw_tcl_enabled) {
-		ctx->Const.VertexProgram.MaxNativeInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAluInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeInstructions = 255;
+		ctx->Const.VertexProgram.MaxNativeAluInstructions = 255;
+		ctx->Const.VertexProgram.MaxNativeAttribs = 16;
 		ctx->Const.VertexProgram.MaxNativeTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeParameters = 256;
 		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
 	}
 
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 61133e6..88d6b06 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -6159,7 +6159,7 @@
     }
     if(uNumValidSrc > 0)
     {
-        prelude_cf_ptr     = pAsm->cf_current_alu_clause_ptr;
+        prelude_cf_ptr     = (R700ControlFlowGenericClause*) pAsm->cf_current_alu_clause_ptr;
         pAsm->alu_x_opcode = SQ_CF_INST_ALU;
     }
 
@@ -6279,7 +6279,7 @@
 
         next_ins(pAsm);        
 
-        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = pAsm->cf_current_alu_clause_ptr;
+        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = (R700ControlFlowGenericClause*) pAsm->cf_current_alu_clause_ptr;
         pAsm->callers[pAsm->unCallerArrayPointer - 1].prelude_cf_ptr = prelude_cf_ptr;
         pAsm->alu_x_opcode = SQ_CF_INST_ALU;
     }
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
index 5a90f72..aab1a79 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.c
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -563,11 +563,15 @@
 
     /* see if we need any point_sprite replacements, also increase num_interp
      * as there's no vp output for them */
-    for (i = FRAG_ATTRIB_TEX0; i<= FRAG_ATTRIB_TEX7; i++)
+    if (ctx->Point.PointSprite)
     {
-        if(ctx->Point.CoordReplace[i - FRAG_ATTRIB_TEX0] == GL_TRUE) {
-            ui++;
-            point_sprite = GL_TRUE;
+        for (i = FRAG_ATTRIB_TEX0; i<= FRAG_ATTRIB_TEX7; i++)
+        {
+            if (ctx->Point.CoordReplace[i - FRAG_ATTRIB_TEX0] == GL_TRUE)
+            {
+                ui++;
+                point_sprite = GL_TRUE;
+            }
         }
     }
 
@@ -670,8 +674,9 @@
 
     for(i=0; i<8; i++)
     {
+	    GLboolean coord_replace = ctx->Point.PointSprite && ctx->Point.CoordReplace[i];
 	    unBit = 1 << (VERT_RESULT_TEX0 + i);
-	    if((OutputsWritten & unBit) || (ctx->Point.CoordReplace[i] == GL_TRUE))
+	    if ((OutputsWritten & unBit) || coord_replace)
 	    {
 		    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i];
 		    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
@@ -679,7 +684,7 @@
 			     SEMANTIC_shift, SEMANTIC_mask);
 		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
 		    /* ARB_point_sprite */
-		    if(ctx->Point.CoordReplace[i] == GL_TRUE)
+		    if (coord_replace)
 		    {
 			     SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, PT_SPRITE_TEX_bit);
 		    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index bcac125..d2b190e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -593,12 +593,7 @@
 	if (!baseImage)
 		return 0;
 
-	/* Check image level against object BaseLevel, but not MaxLevel. MaxLevel is not
-	 * the highest level that can be assigned to the miptree.
-	 */
-	const unsigned maxLevel = texObj->BaseLevel + baseImage->MaxLog2;
-	if (level < texObj->BaseLevel || level > maxLevel
-			|| level > RADEON_MIPTREE_MAX_TEXTURE_LEVELS)
+	if (level < texObj->BaseLevel || level > texObj->MaxLevel)
 		return 0;
 
 	const unsigned levelDiff = level - texObj->BaseLevel;
@@ -620,7 +615,9 @@
 	radeonTexObj *t = radeon_tex_obj(texObj);
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
 
-	/* check image for dimension and level compatibility with texture */
+	/* Since miptree holds only images for levels <BaseLevel..MaxLevel>
+	 * don't allocate the miptree if the teximage won't fit.
+	 */
 	if (!image_matches_texture_obj(texObj, texImage, level))
 		return;
 
diff --git a/src/mesa/drivers/osmesa/Makefile b/src/mesa/drivers/osmesa/Makefile
index ea49a89..c6b4a04 100644
--- a/src/mesa/drivers/osmesa/Makefile
+++ b/src/mesa/drivers/osmesa/Makefile
@@ -20,17 +20,11 @@
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/mesa/main
 
-# Standalone osmesa needs to be linked with core Mesa APIs
-ifeq ($(DRIVER_DIRS), osmesa)
 CORE_MESA = \
 	$(TOP)/src/mesa/libmesa.a \
 	$(TOP)/src/mapi/glapi/libglapi.a \
 	$(TOP)/src/glsl/cl/libglslcl.a \
 	$(TOP)/src/glsl/pp/libglslpp.a
-else
-CORE_MESA =
-endif
-
 
 .c.o:
 	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
diff --git a/src/mesa/main/arbprogram.h b/src/mesa/main/arbprogram.h
index df16513..787ffd6 100644
--- a/src/mesa/main/arbprogram.h
+++ b/src/mesa/main/arbprogram.h
@@ -27,6 +27,10 @@
 #define ARBPROGRAM_H
 
 
+#include "compiler.h"
+#include "glheader.h"
+
+
 extern void GLAPIENTRY
 _mesa_BindProgram(GLenum target, GLuint id);
 
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 1a2e9b1..48b9904 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -147,6 +147,8 @@
 /**
  * Given a GL_*_ATTACHMENTn token, return a pointer to the corresponding
  * gl_renderbuffer_attachment object.
+ * This function is only used for user-created FB objects, not the
+ * default / window-system FB object.
  * If \p attachment is GL_DEPTH_STENCIL_ATTACHMENT, return a pointer to
  * the depth buffer attachment point.
  */
@@ -156,6 +158,8 @@
 {
    GLuint i;
 
+   assert(fb->Name > 0);
+
    switch (attachment) {
    case GL_COLOR_ATTACHMENT0_EXT:
    case GL_COLOR_ATTACHMENT1_EXT:
@@ -195,6 +199,45 @@
 
 
 /**
+ * As above, but only used for getting attachments of the default /
+ * window-system framebuffer (not user-created framebuffer objects).
+ */
+static struct gl_renderbuffer_attachment *
+_mesa_get_fb0_attachment(GLcontext *ctx, struct gl_framebuffer *fb,
+                         GLenum attachment)
+{
+   assert(fb->Name == 0); /* only valid for the default (Name == 0) framebuffer */
+
+   switch (attachment) {
+   case GL_FRONT_LEFT:
+      return &fb->Attachment[BUFFER_FRONT_LEFT];
+   case GL_FRONT_RIGHT:
+      return &fb->Attachment[BUFFER_FRONT_RIGHT];
+   case GL_BACK_LEFT:
+      return &fb->Attachment[BUFFER_BACK_LEFT];
+   case GL_BACK_RIGHT:
+      return &fb->Attachment[BUFFER_BACK_RIGHT];
+   case GL_AUX0:
+      if (fb->Visual.numAuxBuffers == 1) {
+         return &fb->Attachment[BUFFER_AUX0];
+      }
+      return NULL; /* AUX0 is reported only when the visual has exactly one aux buffer */
+   case GL_DEPTH_BUFFER:
+      /* new in GL 3.0; intentionally falls through to share the depth slot */
+   case GL_DEPTH_ATTACHMENT_EXT:
+      return &fb->Attachment[BUFFER_DEPTH];
+   case GL_STENCIL_BUFFER:
+      /* new in GL 3.0; intentionally falls through to share the stencil slot */
+   case GL_STENCIL_ATTACHMENT_EXT:
+      return &fb->Attachment[BUFFER_STENCIL];
+   default:
+      return NULL; /* unrecognized token: caller reports GL_INVALID_ENUM on NULL */
+   }
+}
+
+
+
+/**
  * Remove any texture or renderbuffer attached to the given attachment
  * point.  Update reference counts, etc.
  */
@@ -1878,12 +1921,14 @@
    }
 
    if (buffer->Name == 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetFramebufferAttachmentParameterivEXT");
-      return;
+      /* the default / window-system FBO */
+      att = _mesa_get_fb0_attachment(ctx, buffer, attachment);
+   }
+   else {
+      /* user-created framebuffer object (FBO) */
+      att = _mesa_get_attachment(ctx, buffer, attachment);
    }
 
-   att = _mesa_get_attachment(ctx, buffer, attachment);
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetFramebufferAttachmentParameterivEXT(attachment)");
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 01f8418..56558cf 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -879,6 +879,7 @@
          return GL_FALSE;
       }
       ASSERT(_mesa_get_format_bits(ctx->ReadBuffer->_ColorReadBuffer->Format, GL_RED_BITS) > 0 ||
+             _mesa_get_format_bits(ctx->ReadBuffer->_ColorReadBuffer->Format, GL_ALPHA_BITS) > 0 ||
              _mesa_get_format_bits(ctx->ReadBuffer->_ColorReadBuffer->Format, GL_INDEX_BITS) > 0);
       break;
    case GL_DEPTH:
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 2101b9b..8f7ebee 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1242,8 +1242,6 @@
 
    assert(xoffset % util_format_get_blockwidth(pformat) == 0);
    assert(yoffset % util_format_get_blockheight(pformat) == 0);
-   assert(width % util_format_get_blockwidth(pformat) == 0);
-   assert(height % util_format_get_blockheight(pformat) == 0);
 
    for (y = 0; y < height; y += util_format_get_blockheight(pformat)) {
       /* don't need to adjust for xoffset and yoffset as st_texture_image_map does that */
diff --git a/src/mesa/swrast_setup/ss_triangle.c b/src/mesa/swrast_setup/ss_triangle.c
index bad0d81..f22bc52 100644
--- a/src/mesa/swrast_setup/ss_triangle.c
+++ b/src/mesa/swrast_setup/ss_triangle.c
@@ -159,7 +159,7 @@
 }
 
 #define SS_COLOR(a,b) UNCLAMPED_FLOAT_TO_RGBA_CHAN(a,b)
-#define SS_SPEC(a,b) UNCLAMPED_FLOAT_TO_RGB_CHAN(a,b)
+#define SS_SPEC(a,b) COPY_4V(a,b)
 #define SS_IND(a,b) (a = b)
 
 #define IND (0)