draw: Work around an invalid write.

The SSE vertex shader does not seem to honor the execution mask, so it
may write past the end of the output array.  Pad the output array to a
multiple of 4 vertices as a workaround.
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 24c538b..121dfc4 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -184,7 +184,7 @@
    output_verts->count = input_verts->count;
    output_verts->verts =
       (struct vertex_header *)MALLOC(output_verts->vertex_size *
-                                     output_verts->count);
+                                     align(output_verts->count, 4));
 
    vshader->run_linear(vshader,
                        (const float (*)[4])input_verts->verts->data,