r600g: implement transform feedback

r600: DONE.
r700: MOSTLY (done but locks up).
Evergreen: MOSTLY (done but doesn't work for an unknown reason).

The kernel support will come soon.
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index f6b8631..877e162 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -73,6 +73,35 @@
 			bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
 		id++;
 		break;
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
+		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
+			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
+			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
+			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
+		bc->bytecode[id] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
+			cf->output.inst |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask) |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size);
+		if (bc->chip_class == EVERGREEN) /* no EOP on cayman */
+			bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
+		id++;
+		break;
 	case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 	case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 	case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 96e8d18..bd1d969 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -1241,3 +1241,38 @@
 
 	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
 }
+
+void evergreen_flush_vgt_streamout(struct r600_context *ctx)
+{	/* Appends 12 dwords to ctx->pm4 (no space check here): zero CP_STRMOUT_CNTL, emit a streamout flush event, then wait for OFFSET_UPDATE_DONE. */
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_0084FC_CP_STRMOUT_CNTL - EVERGREEN_CONFIG_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* register value: OFFSET_UPDATE_DONE (bit 0) is written as 0 before the flush */
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+	ctx->pm4[ctx->pm4_cdwords++] = R_0084FC_CP_STRMOUT_CNTL >> 2;  /* register (full address >> 2, not relative to the config-reg base as in SET_CONFIG_REG above) */
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* NOTE(review): presumably the upper address dword, unused when polling a register - confirm against the PM4 docs */
+	ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */
+	ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */
+	ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+}
+
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
+{	/* Appends 6 dwords (enable) or 3 dwords (disable) to ctx->pm4; buffer_enable_bit == 0 selects the disable path. */
+	if (buffer_enable_bit) {
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+		ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(1); /* only stream 0 is enabled */
+
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+		ctx->pm4[ctx->pm4_cdwords++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit); /* the macro keeps only the low 4 bits of the mask */
+	} else { /* disable: only VGT_STRMOUT_CONFIG is rewritten, VGT_STRMOUT_BUFFER_CONFIG is not touched */
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+		ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(0);
+	}
+}
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index d0c02d5..6f5d6f7 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -927,8 +927,8 @@
 				state->fill_back != PIPE_POLYGON_MODE_FILL);
 	r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL,
 		S_028814_PROVOKING_VTX_LAST(prov_vtx) |
-		S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-		S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+		S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+		S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
 		S_028814_FACE(!state->front_ccw) |
 		S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
 		S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
@@ -1688,6 +1688,9 @@
 	rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
 	rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
 	rctx->context.texture_barrier = evergreen_texture_barrier;
+	rctx->context.create_stream_output_target = r600_create_so_target;
+	rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+	rctx->context.set_stream_output_targets = r600_set_so_targets;
 }
 
 static void cayman_init_config(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 6baa2a7..68b77b4 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -50,6 +50,8 @@
 #define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
 #define EVENT_TYPE_ZPASS_DONE                  0x15
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH	0x1f
+
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
                 /* 0 - any non-TS event
@@ -82,6 +84,7 @@
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A
 #define PKT3_WAIT_REG_MEM                      0x3C
+#define		WAIT_REG_MEM_EQUAL		3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_CP_INTERRUPT                      0x40
@@ -118,6 +121,12 @@
 #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
 
 /* Registers */
+#define R_0084FC_CP_STRMOUT_CNTL		     0x000084FC
+#define   S_0084FC_OFFSET_UPDATE_DONE(x)		(((x) & 0x1) << 0)
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0    0x008960 /* read-only */
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1    0x008964 /* read-only */
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2    0x008968 /* read-only */
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3    0x00896C /* read-only */
 #define R_008C00_SQ_CONFIG                           0x00008C00
 #define   S_008C00_VC_ENABLE(x)                        (((x) & 0x1) << 0)
 #define   G_008C00_VC_ENABLE(x)                        (((x) >> 0) & 0x1)
@@ -1723,6 +1732,33 @@
 #define R_028AC0_DB_SRESULTS_COMPARE_STATE0          0x00028AC0
 #define R_028AC4_DB_SRESULTS_COMPARE_STATE1          0x00028AC4
 #define R_028AC8_DB_PRELOAD_CONTROL                  0x00028AC8
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0	     0x028AD0
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0	     0x028AD4
+#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0	     0x028AD8
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0	     0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1	     0x028AE0
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1	     0x028AE4
+#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1	     0x028AE8
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1	     0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2	     0x028AF0
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2	     0x028AF4
+#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2	     0x028AF8
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2	     0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3	     0x028B00
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3	     0x028B04
+#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3	     0x028B08
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3	     0x028B0C
+#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0	     0x028B10
+#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1	     0x028B14
+#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2	     0x028B18
+#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3	     0x028B1C
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET	     0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30
+#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0	     0x028B44
+#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1	     0x028B48
+#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2	     0x028B4C
+#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3	     0x028B50
 #define R_028B54_VGT_SHADER_STAGES_EN                0x00028B54
 #define R_028B70_DB_ALPHA_TO_MASK                    0x00028B70
 #define R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL       0x00028B78
@@ -1750,7 +1786,16 @@
 #define   G_028B8C_OFFSET(x)                           (((x) >> 0) & 0xFFFFFFFF)
 #define   C_028B8C_OFFSET                              0x00000000
 #define R_028B94_VGT_STRMOUT_CONFIG                  0x00028B94
+#define   S_028B94_STREAMOUT_0_EN(x)			(((x) & 0x1) << 0)
+#define   S_028B94_STREAMOUT_1_EN(x)			(((x) & 0x1) << 1)
+#define   S_028B94_STREAMOUT_2_EN(x)			(((x) & 0x1) << 2)
+#define   S_028B94_STREAMOUT_3_EN(x)			(((x) & 0x1) << 3)
+#define   S_028B94_RAST_STREAM(x)			(((x) & 0x7) << 4)
 #define R_028B98_VGT_STRMOUT_BUFFER_CONFIG           0x00028B98
+#define   S_028B98_STREAM_0_BUFFER_EN(x)		(((x) & 0xf) << 0)
+#define   S_028B98_STREAM_1_BUFFER_EN(x)		(((x) & 0xf) << 4)
+#define   S_028B98_STREAM_2_BUFFER_EN(x)		(((x) & 0xf) << 8)
+#define   S_028B98_STREAM_3_BUFFER_EN(x)		(((x) & 0xf) << 12)
 #define R_028C00_PA_SC_LINE_CNTL                     0x00028C00
 #define R_028C04_PA_SC_AA_CONFIG                     0x00028C04
 #define R_028C08_PA_SU_VTX_CNTL                      0x00028C08
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index e697406..fbd12fb 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -167,6 +167,7 @@
 	union {
 		uint64_t			u64;
 		boolean				b;
+		struct pipe_query_data_so_statistics so;
 	} result;
 	/* The kind of query */
 	unsigned				type;
@@ -187,6 +188,15 @@
 	struct list_head			list;
 };
 
+struct r600_so_target {
+	struct pipe_stream_output_target b; /* must stay the first member: the so_targets array is cast to pipe_stream_output_target** for util_blitter_save_so_targets */
+
+	/* The buffer where BUFFER_FILLED_SIZE is stored. */
+	struct r600_resource	*filled_size;
+	unsigned		stride; /* NOTE(review): presumably the vertex stride used for this target; units are not visible here - confirm */
+	unsigned		so_index; /* NOTE(review): presumably the streamout buffer slot this target is bound to - confirm */
+};
+
 #define R600_CONTEXT_DRAW_PENDING	(1 << 0)
 #define R600_CONTEXT_DST_CACHES_DIRTY	(1 << 1)
 #define R600_CONTEXT_CHECK_EVENT_FLUSH	(1 << 2)
@@ -218,6 +228,7 @@
 	/* The list of active queries. Only one query of each type can be active. */
 	struct list_head	active_query_list;
 	unsigned		num_cs_dw_queries_suspend;
+	unsigned		num_cs_dw_streamout_end;
 
 	unsigned		backend_mask;
 	unsigned                max_db; /* for OQ */
@@ -229,6 +240,12 @@
 	struct r600_range fs_resources;
 	int num_ps_resources, num_vs_resources, num_fs_resources;
 	boolean			have_depth_texture, have_depth_fb;
+
+	unsigned			num_so_targets;
+	struct r600_so_target		*so_targets[PIPE_MAX_SO_BUFFERS];
+	boolean				streamout_start;
+	unsigned			streamout_append_bitmask;
+	unsigned			*vs_shader_so_strides;
 };
 
 struct r600_draw {
@@ -268,6 +285,10 @@
 void r600_context_flush_all(struct r600_context *ctx, unsigned flush_flags);
 void r600_context_flush_dest_caches(struct r600_context *ctx);
 
+void r600_context_streamout_begin(struct r600_context *ctx);
+void r600_context_streamout_end(struct r600_context *ctx);
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t);
+
 int evergreen_context_init(struct r600_context *ctx, struct r600_screen *screen);
 void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw);
 void evergreen_context_flush_dest_caches(struct r600_context *ctx);
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 1ab16f2..e617b16 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1672,6 +1672,21 @@
 			cf->output.inst |
 			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
 		break;
+	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
+		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
+			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
+			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
+			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
+		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
+			cf->output.inst |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
+			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
+		break;
 	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1730,6 +1745,22 @@
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1760,6 +1791,10 @@
 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
 			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1849,6 +1884,22 @@
 				break;
 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
@@ -1923,6 +1974,10 @@
 				break;
 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
@@ -2057,6 +2112,44 @@
 				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
 				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
 				break;
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
+				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
+					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
+					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
+				fprintf(stderr, "GPR:%X ", cf->output.gpr);
+				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
+				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
+				fprintf(stderr, "TYPE:%X\n", cf->output.type);
+				id++;
+				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
+					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
+					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
+				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
+				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
+				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
+				fprintf(stderr, "INST:%d ", cf->output.inst);
+				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
+				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
+				break;
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -2125,6 +2218,28 @@
 				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
 				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
 				break;
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
+				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
+					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
+				fprintf(stderr, "GPR:%X ", cf->output.gpr);
+				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
+				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
+				fprintf(stderr, "TYPE:%X\n", cf->output.type);
+				id++;
+				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
+					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
+				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
+				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
+				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
+				fprintf(stderr, "INST:%d ", cf->output.inst);
+				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
+				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
+				break;
 			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 0fd4467..d0ff75d 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -107,6 +107,8 @@
 
 struct r600_bytecode_output {
 	unsigned			array_base;
+	unsigned			array_size;
+	unsigned			comp_mask;
 	unsigned			type;
 	unsigned			end_of_program;
 
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 9326dc6..313ed12 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -65,6 +65,8 @@
 	util_blitter_save_vertex_buffers(rctx->blitter,
 					 rctx->vbuf_mgr->nr_vertex_buffers,
 					 rctx->vbuf_mgr->vertex_buffer);
+	util_blitter_save_so_targets(rctx->blitter, rctx->ctx.num_so_targets,
+				     (struct pipe_stream_output_target**)rctx->ctx.so_targets);
 
 	if (op & R600_SAVE_FRAMEBUFFER)
 		util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer);
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 52e0be7..1dba966 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -943,6 +943,9 @@
 	/* Count in queries_suspend. */
 	num_dw += ctx->num_cs_dw_queries_suspend;
 
+	/* Count in streamout_end at the end of CS. */
+	num_dw += ctx->num_cs_dw_streamout_end;
+
 	/* Count in render_condition(NULL) at the end of CS. */
 	if (ctx->predicate_drawing) {
 		num_dw += 3;
@@ -1471,6 +1474,12 @@
 		r600_context_block_resource_emit_dirty(ctx, dirty_block);
 	}
 
+	/* Enable stream out if needed. */
+	if (ctx->streamout_start) {
+		r600_context_streamout_begin(ctx);
+		ctx->streamout_start = FALSE;
+	}
+
 	/* draw packet */
 	pm4 = &ctx->pm4[ctx->pm4_cdwords];
 
@@ -1503,6 +1512,7 @@
 {
 	struct r600_block *enable_block = NULL;
 	bool queries_suspended = false;
+	bool streamout_suspended = false;
 
 	if (ctx->pm4_cdwords == ctx->init_dwords)
 		return;
@@ -1513,6 +1523,11 @@
 		queries_suspended = true;
 	}
 
+	if (ctx->num_cs_dw_streamout_end) {
+		r600_context_streamout_end(ctx);
+		streamout_suspended = true;
+	}
+
 	if (ctx->screen->chip_class >= EVERGREEN)
 		evergreen_context_flush_dest_caches(ctx);
 	else
@@ -1542,6 +1557,11 @@
 
 	r600_init_cs(ctx);
 
+	if (streamout_suspended) {
+		ctx->streamout_start = TRUE;
+		ctx->streamout_append_bitmask = ~0;
+	}
+
 	/* resume queries */
 	if (queries_suspended) {
 		r600_context_queries_resume(ctx);
@@ -1636,6 +1656,44 @@
 			results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
 		}
 		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		/* SAMPLE_STREAMOUTSTATS stores this structure:
+		 * {
+		 *    u64 NumPrimitivesWritten;
+		 *    u64 PrimitiveStorageNeeded;
+		 * }
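+		 * We only need NumPrimitivesWritten here. NOTE(review): it is read with indices 2/6 while PrimitiveStorageNeeded is read with 0/4, the reverse of the order listed above if these are dword offsets - confirm the layout. */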
+		while (results_base != query->results_end) {
+			query->result.u64 +=
+				r600_query_read_result(map + results_base, 2, 6, true);
+			results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+		}
+		break;
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		/* Here we read PrimitiveStorageNeeded. */
+		while (results_base != query->results_end) {
+			query->result.u64 +=
+				r600_query_read_result(map + results_base, 0, 4, true);
+			results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+		}
+		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		while (results_base != query->results_end) {
+			query->result.so.num_primitives_written +=
+				r600_query_read_result(map + results_base, 2, 6, true);
+			query->result.so.primitives_storage_needed +=
+				r600_query_read_result(map + results_base, 0, 4, true);
+			results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+		}
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		while (results_base != query->results_end) {
+			query->result.b = query->result.b ||
+				r600_query_read_result(map + results_base, 2, 6, true) !=
+				r600_query_read_result(map + results_base, 0, 4, true);
+			results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+		}
+		break;
 	default:
 		assert(0);
 	}
@@ -1679,6 +1737,15 @@
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
+		results = (u32*)((char*)results + query->results_end);
+		memset(results, 0, query->result_size);
+		ctx->ws->buffer_unmap(query->buffer->buf);
+		break;
 	default:
 		assert(0);
 	}
@@ -1692,6 +1759,15 @@
 		ctx->pm4[ctx->pm4_cdwords++] = query->results_end;
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+		ctx->pm4[ctx->pm4_cdwords++] = query->results_end;
+		ctx->pm4[ctx->pm4_cdwords++] = 0;
+		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
@@ -1720,6 +1796,15 @@
 		ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8;
 		ctx->pm4[ctx->pm4_cdwords++] = 0;
 		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+		ctx->pm4[ctx->pm4_cdwords++] = query->results_end + query->result_size/2;
+		ctx->pm4[ctx->pm4_cdwords++] = 0;
+		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
 		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
@@ -1798,6 +1883,14 @@
 		query->result_size = 16;
 		query->num_cs_dw = 8;
 		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+		query->result_size = 32;
+		query->num_cs_dw = 6;
+		break;
 	default:
 		assert(0);
 		FREE(query);
@@ -1832,20 +1925,28 @@
 {
 	boolean *result_b = (boolean*)vresult;
 	uint64_t *result_u64 = (uint64_t*)vresult;
+	struct pipe_query_data_so_statistics *result_so =
+		(struct pipe_query_data_so_statistics*)vresult;
 
 	if (!r600_query_result(ctx, query, wait))
 		return FALSE;
 
 	switch (query->type) {
 	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
 		*result_u64 = query->result.u64;
 		break;
 	case PIPE_QUERY_OCCLUSION_PREDICATE:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 		*result_b = query->result.b;
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
 		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		*result_so = query->result.so;
+		break;
 	default:
 		assert(0);
 	}
@@ -1872,3 +1973,237 @@
 		r600_query_begin(ctx, query);
 	}
 }
+
+static void r600_flush_vgt_streamout(struct r600_context *ctx)
+{
+	/* Queue the 12-dword flush sequence: clear CP_STRMOUT_CNTL, raise the
+	 * VGT streamout flush event, then wait on OFFSET_UPDATE_DONE. */
+	const uint32_t seq[] = {
+		PKT3(PKT3_SET_CONFIG_REG, 1, 0), (R_008490_CP_STRMOUT_CNTL - R600_CONFIG_REG_OFFSET) >> 2, 0,
+		PKT3(PKT3_EVENT_WRITE, 0, 0), EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0),
+		PKT3(PKT3_WAIT_REG_MEM, 5, 0), WAIT_REG_MEM_EQUAL /* wait until register == reference */,
+		R_008490_CP_STRMOUT_CNTL >> 2 /* register */, 0,
+		S_008490_OFFSET_UPDATE_DONE(1) /* reference value */, S_008490_OFFSET_UPDATE_DONE(1) /* mask */,
+		4 /* poll interval */
+	};
+	unsigned i;
+
+	for (i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
+		ctx->pm4[ctx->pm4_cdwords++] = seq[i];
+}
+
+static void r600_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
+{
+	/* Global streamout switch: on exactly when at least one buffer bit is set. */
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_028AB0_VGT_STRMOUT_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = S_028AB0_STREAMOUT(buffer_enable_bit ? 1 : 0);
+
+	/* Disabling emits nothing further; the per-buffer mask register is not rewritten. */
+	if (!buffer_enable_bit)
+		return;
+
+	/* Per-buffer enable mask, written only on the enabling path. */
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_028B20_VGT_STRMOUT_BUFFER_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = buffer_enable_bit;
+}
+
+void r600_context_streamout_begin(struct r600_context *ctx)
+{
+	/* Begin streamout on the bound targets: reserve CS space, flush and enable VGT streamout, program each buffer. */
+	struct r600_so_target **t = ctx->so_targets;
+	unsigned *strides = ctx->vs_shader_so_strides;
+	unsigned buffer_en, i, update_flags = 0;
+
+	buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
+		    (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
+		    (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
+		    (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
+
+	ctx->num_cs_dw_streamout_end =
+		12 + /* flush_vgt_streamout */
+		util_bitcount(buffer_en) * 8 +
+		8; /* streamout disable (3) + SURFACE_SYNC (5) */
+
+	r600_need_cs_space(ctx,
+			   12 + /* flush_vgt_streamout */
+			   6 + /* enables */
+			   util_bitcount(buffer_en) * 7 + /* buffer size/stride/base registers (5) + reloc NOP (2) */
+			   util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
+			   util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
+			   (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770 ? 2 : 0) +
+			   ctx->num_cs_dw_streamout_end, TRUE);
+
+	if (ctx->screen->chip_class >= EVERGREEN) {
+		evergreen_flush_vgt_streamout(ctx);
+		evergreen_set_streamout_enable(ctx, buffer_en);
+	} else {
+		r600_flush_vgt_streamout(ctx);
+		r600_set_streamout_enable(ctx, buffer_en);
+	}
+
+	for (i = 0; i < ctx->num_so_targets; i++) {
+		if (t[i]) {
+			t[i]->stride = strides[i];
+			t[i]->so_index = i;
+			update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
+
+			ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
+			ctx->pm4[ctx->pm4_cdwords++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
+							16*i - R600_CONTEXT_REG_OFFSET) >> 2;
+			ctx->pm4[ctx->pm4_cdwords++] = (t[i]->b.buffer_offset +
+							t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
+			ctx->pm4[ctx->pm4_cdwords++] = strides[i] >> 2;		   /* VTX_STRIDE (in DW) */
+			ctx->pm4[ctx->pm4_cdwords++] = 0;			   /* BUFFER_BASE */
+
+			ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+			ctx->pm4[ctx->pm4_cdwords++] =
+				r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
+						      RADEON_USAGE_WRITE);
+
+			if (ctx->streamout_append_bitmask & (1 << i)) {
+				/* Append. */
+				ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+				ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+							       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */
+				ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+				ctx->pm4[ctx->pm4_cdwords++] =
+					r600_context_bo_reloc(ctx,  t[i]->filled_size,
+							      RADEON_USAGE_READ);
+			} else {
+				/* Start from the beginning. */
+				ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+				ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+							       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+				ctx->pm4[ctx->pm4_cdwords++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
+				ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+			}
+		}
+	}
+
+	if (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770) {
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = update_flags;
+	}
+}
+
+void r600_context_streamout_end(struct r600_context *ctx)
+{
+	/* End streamout: flush VGT, store every bound target's filled size, disable streamout, flush/sync caches. */
+	struct r600_so_target **t = ctx->so_targets;
+	unsigned i, flush_flags = 0;
+
+	if (ctx->screen->chip_class >= EVERGREEN) {
+		evergreen_flush_vgt_streamout(ctx);
+	} else {
+		r600_flush_vgt_streamout(ctx);
+	}
+
+	for (i = 0; i < ctx->num_so_targets; i++) {
+		if (t[i]) {
+			ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+			ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+						       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+						       STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
+			ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address lo */
+			ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address hi */
+			ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+			ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+			ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+			ctx->pm4[ctx->pm4_cdwords++] =
+				r600_context_bo_reloc(ctx,  t[i]->filled_size,
+						      RADEON_USAGE_WRITE);
+
+			flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
+		}
+	}
+
+	if (ctx->screen->chip_class >= EVERGREEN) {
+		evergreen_set_streamout_enable(ctx, 0);
+	} else {
+		r600_set_streamout_enable(ctx, 0);
+	}
+
+	if (ctx->screen->family < CHIP_RV770) {
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
+	} else {
+		ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+		ctx->pm4[ctx->pm4_cdwords++] = flush_flags;     /* CP_COHER_CNTL */
+		ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff;      /* CP_COHER_SIZE */
+		ctx->pm4[ctx->pm4_cdwords++] = 0;               /* CP_COHER_BASE */
+		ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A;      /* POLL_INTERVAL */
+	}
+
+	ctx->num_cs_dw_streamout_end = 0;
+
+	/* XXX print some debug info (a target whose buffer cannot be mapped is skipped, not dereferenced) */
+	for (i = 0; i < ctx->num_so_targets; i++) {
+		uint32_t *ptr = !t[i] ? NULL : ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs, RADEON_USAGE_READ);
+		if (!ptr)
+			continue;
+		printf("FILLED_SIZE%i: %u\n", i, *ptr);
+		ctx->ws->buffer_unmap(t[i]->filled_size->buf);
+	}
+}
+
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
+{
+	r600_need_cs_space(ctx, 14 + 21, TRUE);
+	/* Program the draw-opaque registers from t; only 14 dwords are emitted while the second part stays under #if 0. */
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - R600_CONTEXT_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = 0;
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - R600_CONTEXT_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = t->stride >> 2;
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */
+	ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); /* reloc for t->filled_size; presumably supplies the COPY_DW source address */
+	ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx,  t->filled_size,
+							     RADEON_USAGE_READ);
+
+#if 0 /* I have not found this useful yet. */
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
+	ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+	ctx->pm4[ctx->pm4_cdwords++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_0085F0_CP_COHER_CNTL - R600_CONFIG_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = (R_0085F8_CP_COHER_BASE - R600_CONFIG_REG_OFFSET) >> 2;
+	ctx->pm4[ctx->pm4_cdwords++] = t->b.buffer_offset >> 2;
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
+							     RADEON_USAGE_WRITE);
+
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+	ctx->pm4[ctx->pm4_cdwords++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
+	ctx->pm4[ctx->pm4_cdwords++] = 0;
+	ctx->pm4[ctx->pm4_cdwords++] = 0; /* reference value */
+	ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* mask */
+	ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+#endif
+}
diff --git a/src/gallium/drivers/r600/r600_hw_context_priv.h b/src/gallium/drivers/r600/r600_hw_context_priv.h
index bea6135..206de7e 100644
--- a/src/gallium/drivers/r600/r600_hw_context_priv.h
+++ b/src/gallium/drivers/r600/r600_hw_context_priv.h
@@ -76,6 +76,13 @@
 void r600_init_cs(struct r600_context *ctx);
 int r600_resource_init(struct r600_context *ctx, struct r600_range *range, unsigned offset, unsigned nblocks, unsigned stride, struct r600_reg *reg, int nreg, unsigned offset_base);
 
+/*
+ * evergreen_hw_context.c
+ */
+void evergreen_flush_vgt_streamout(struct r600_context *ctx);
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit);
+
+
 static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r600_resource *rbo,
 					     enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 97c6808..22fefaa 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -378,6 +378,7 @@
 	case PIPE_CAP_PRIMITIVE_RESTART:
 	case PIPE_CAP_CONDITIONAL_RENDER:
 	case PIPE_CAP_TEXTURE_BARRIER:
+	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
 		return 1;
 
 	/* Supported except the original R600. */
@@ -391,17 +392,21 @@
 		return family >= CHIP_CEDAR ? 1 : 0;
 
 	/* Unsupported features. */
-	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
-	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
 	case PIPE_CAP_TGSI_INSTANCEID:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
 	case PIPE_CAP_SCALED_RESOLVE:
 		return 0;
 
+	/* Stream output. */
+	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+		return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 4 : 0;
+	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
+		return 16;
+	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+		return 16*4;
+
 	/* Texturing. */
 	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
 	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index a127eed..46443af 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -133,6 +133,8 @@
 	struct r600_vertex_element	vertex_elements;
 	struct tgsi_token		*tokens;
 	unsigned	sprite_coord_enable;
+	struct pipe_stream_output_info	so;
+	unsigned			so_strides[4];
 };
 
 struct r600_pipe_sampler_state {
@@ -348,6 +350,19 @@
 void r600_delete_vs_shader(struct pipe_context *ctx, void *state);
 void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
 			      struct pipe_resource *buffer);
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+		      struct pipe_resource *buffer,
+		      unsigned buffer_offset,
+		      unsigned buffer_size);
+void r600_so_target_destroy(struct pipe_context *ctx,
+			    struct pipe_stream_output_target *target);
+void r600_set_so_targets(struct pipe_context *ctx,
+			 unsigned num_targets,
+			 struct pipe_stream_output_target **targets,
+			 unsigned append_bitmask);
+
+
 void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
 
 /*
diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c
index ec0d91f..ee2d04b 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -100,7 +100,21 @@
 	}
 
 	rctx->ctx.predicate_drawing = true;
-	r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+
+	switch (rquery->type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+		r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
+		break;
+	default:
+		assert(0);
+	}
 }
 
 void r600_init_query_functions(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 61f88f4..ad4aded 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -119,6 +119,19 @@
 	if (dump_shaders) {
 		fprintf(stderr, "--------------------------------------------------------------\n");
 		tgsi_dump(shader->tokens, 0);
+
+		if (shader->so.num_outputs) {
+			unsigned i;
+			fprintf(stderr, "STREAMOUT\n");
+			for (i = 0; i < shader->so.num_outputs; i++) {
+				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
+					shader->so.output[i].output_buffer, shader->so.output[i].register_index,
+				        shader->so.output[i].register_mask & 1 ? "x" : "_",
+				        (shader->so.output[i].register_mask >> 1) & 1 ? "y" : "_",
+				        (shader->so.output[i].register_mask >> 2) & 1 ? "z" : "_",
+				        (shader->so.output[i].register_mask >> 3) & 1 ? "w" : "_");
+			}
+		}
 	}
 	r = r600_shader_from_tgsi(rctx, shader);
 	if (r) {
@@ -681,6 +694,7 @@
 {
 	struct r600_shader *shader = &pipeshader->shader;
 	struct tgsi_token *tokens = pipeshader->tokens;
+	struct pipe_stream_output_info so = pipeshader->so;
 	struct tgsi_full_immediate *immediate;
 	struct tgsi_full_property *property;
 	struct r600_shader_ctx ctx;
@@ -847,6 +861,93 @@
 		}
 	}
 
+	/* Add stream outputs. */
+	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
+		unsigned buffer_offset[PIPE_MAX_SO_BUFFERS] = {0};
+
+		for (i = 0; i < so.num_outputs; i++) {
+			struct r600_bytecode_output output;
+			unsigned comps;
+
+			if (so.output[i].output_buffer >= 4) {
+				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
+					 so.output[i].output_buffer);
+				r = -EINVAL;
+				goto out_err;
+			}
+
+			switch (so.output[i].register_mask) {
+			case TGSI_WRITEMASK_XYZW:
+				comps = 4;
+				break;
+			case TGSI_WRITEMASK_XYZ:
+				comps = 3;
+				break;
+			case TGSI_WRITEMASK_XY:
+				comps = 2;
+				break;
+			case TGSI_WRITEMASK_X:
+				comps = 1;
+				break;
+			default:
+				R600_ERR("streamout: invalid register_mask, got: %x\n",
+					 so.output[i].register_mask);
+				r = -EINVAL;
+				goto out_err;
+			}
+
+			memset(&output, 0, sizeof(struct r600_bytecode_output));
+			output.gpr = shader->output[so.output[i].register_index].gpr;
+			output.elem_size = 0;
+			output.array_base = buffer_offset[so.output[i].output_buffer];
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+			output.burst_count = 1;
+			output.barrier = 1;
+			output.array_size = 0;
+			output.comp_mask = so.output[i].register_mask;
+			if (ctx.bc->chip_class >= EVERGREEN) {
+				switch (so.output[i].output_buffer) {
+				case 0:
+					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
+					break;
+				case 1:
+					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
+					break;
+				case 2:
+					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
+					break;
+				case 3:
+					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
+					break;
+				}
+			} else {
+				switch (so.output[i].output_buffer) {
+				case 0:
+					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
+					break;
+				case 1:
+					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
+					break;
+				case 2:
+					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
+					break;
+				case 3:
+					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
+					break;
+				}
+			}
+			r = r600_bytecode_add_output(ctx.bc, &output);
+			if (r)
+				goto out_err;
+
+			buffer_offset[so.output[i].output_buffer] += comps;
+		}
+
+		for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+			pipeshader->so_strides[i] = buffer_offset[i] * 4;
+		}
+	}
+
 	/* export output */
 	j = 0;
 	for (i = 0, pos0 = 0; i < noutput; i++) {
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index 56ed35e..b9c4126 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -114,6 +114,10 @@
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS                 0x00000001
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM               0x00000002
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_SX                  0x00000003
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE               0x00000000
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND           0x00000001
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ                0x00000002
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND            0x00000003
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) & 0x7F) << 15)
 #define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) >> 15) & 0x7F)
 #define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR                          0xFFC07FFF
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 8410cfe..7f44035 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -933,7 +933,7 @@
 }
 
 static void *r600_create_rs_state(struct pipe_context *ctx,
-					const struct pipe_rasterizer_state *state)
+				  const struct pipe_rasterizer_state *state)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_rasterizer *rs = CALLOC_STRUCT(r600_pipe_rasterizer);
@@ -978,8 +978,8 @@
 				state->fill_back != PIPE_POLYGON_MODE_FILL);
 	r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL,
 		S_028814_PROVOKING_VTX_LAST(prov_vtx) |
-		S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-		S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+		S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+		S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
 		S_028814_FACE(!state->front_ccw) |
 		S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
 		S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
@@ -1758,6 +1758,9 @@
 	rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
 	rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
 	rctx->context.texture_barrier = r600_texture_barrier;
+	rctx->context.create_stream_output_target = r600_create_so_target;
+	rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+	rctx->context.set_stream_output_targets = r600_set_so_targets;
 }
 
 void r600_adjust_gprs(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index d6ffda4..9f6f514 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -247,10 +247,11 @@
 void *r600_create_shader_state(struct pipe_context *ctx,
 			       const struct pipe_shader_state *state)
 {
-	struct r600_pipe_shader *shader =  CALLOC_STRUCT(r600_pipe_shader);
+	struct r600_pipe_shader *shader = CALLOC_STRUCT(r600_pipe_shader);
 	int r;
 
 	shader->tokens = tgsi_dup_tokens(state->tokens);
+	shader->so = state->stream_output;
 
 	r =  r600_pipe_shader_create(ctx, shader);
 	if (r) {
@@ -412,6 +413,71 @@
 		pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL);
 }
 
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+		      struct pipe_resource *buffer,
+		      unsigned buffer_offset, unsigned buffer_size)
+{
+	/* Creates a target on buffer with a zero-filled filled_size buffer; returns NULL if an allocation or the map fails. */
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_so_target *t = CALLOC_STRUCT(r600_so_target);
+	void *ptr;
+
+	if (!t)
+		return NULL;
+
+	t->b.reference.count = 1;
+	t->b.context = ctx;
+	pipe_resource_reference(&t->b.buffer, buffer);
+	t->b.buffer_offset = buffer_offset;
+	t->b.buffer_size = buffer_size;
+
+	t->filled_size = (struct r600_resource*)pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STATIC, 4);
+	ptr = t->filled_size ? rctx->ws->buffer_map(t->filled_size->buf, rctx->ctx.cs, PIPE_TRANSFER_WRITE) : NULL;
+	if (!ptr) {
+		r600_so_target_destroy(ctx, &t->b); /* drops the buffer reference and filled_size (if any), frees t */
+		return NULL;
+	}
+	memset(ptr, 0, t->filled_size->buf->size);
+	rctx->ws->buffer_unmap(t->filled_size->buf);
+	return &t->b;
+}
+
+void r600_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
+{
+	/* Drop the two buffer references held by the target, then free the wrapper itself. */
+	struct r600_so_target *so = (struct r600_so_target*)target;
+	pipe_resource_reference(&so->b.buffer, NULL);
+	pipe_resource_reference((struct pipe_resource**)&so->filled_size, NULL);
+	FREE(so);
+}
+
+void r600_set_so_targets(struct pipe_context *ctx,
+			 unsigned num_targets,
+			 struct pipe_stream_output_target **targets,
+			 unsigned append_bitmask)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_context *hw = &rctx->ctx;
+	unsigned slot, nslots;
+
+	/* Close the streamout sequence of the outgoing targets, if any are bound. */
+	if (hw->num_so_targets)
+		r600_context_streamout_end(hw);
+
+	/* Rebind in one pass: slots below num_targets take the new target, stale slots above are released. */
+	nslots = num_targets > hw->num_so_targets ? num_targets : hw->num_so_targets;
+	for (slot = 0; slot < nslots; slot++) {
+		pipe_so_target_reference((struct pipe_stream_output_target**)&hw->so_targets[slot],
+					 slot < num_targets ? targets[slot] : NULL);
+	}
+
+	/* Record the new binding; streamout_start is set whenever at least one target is bound. */
+	hw->num_so_targets = num_targets;
+	hw->streamout_start = num_targets != 0;
+	hw->streamout_append_bitmask = append_bitmask;
+}
+
 static void r600_vertex_buffer_update(struct r600_pipe_context *rctx)
 {
 	struct r600_pipe_resource_state *rstate;
@@ -528,7 +594,7 @@
 	struct pipe_index_buffer ib = {};
 	unsigned prim, mask, ls_mask = 0;
 
-	if (!info.count ||
+	if ((!info.count && (info.indexed || !info.count_from_stream_output)) ||
 	    (info.indexed && !rctx->vbuf_mgr->index_buffer.buffer) ||
 	    !r600_conv_pipe_prim(info.mode, &prim)) {
 		return;
@@ -572,8 +638,15 @@
 	} else {
 		info.index_bias = info.start;
 		rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+		if (info.count_from_stream_output) {
+			rdraw.vgt_draw_initiator |= S_0287F0_USE_OPAQUE(1);
+
+			r600_context_draw_opaque_count(&rctx->ctx, (struct r600_so_target*)info.count_from_stream_output);
+		}
 	}
 
+	rctx->ctx.vs_shader_so_strides = rctx->vs_shader->so_strides;
+
 	mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1;
 
 	if (rctx->vgt.id != R600_PIPE_STATE_VGT) {
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 7a2fe02..ccdf82e 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -76,10 +76,23 @@
 #define PKT3_DRAW_INDEX_IMMD                   0x2E
 #define PKT3_NUM_INSTANCES                     0x2F
 #define PKT3_STRMOUT_BUFFER_UPDATE             0x34
+#define		STRMOUT_STORE_BUFFER_FILLED_SIZE	1
+#define		STRMOUT_OFFSET_SOURCE(x)	(((x) & 0x3) << 1)
+#define			STRMOUT_OFFSET_FROM_PACKET		0
+#define			STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE	1
+#define			STRMOUT_OFFSET_FROM_MEM			2
+#define			STRMOUT_OFFSET_NONE			3
+#define		STRMOUT_SELECT_BUFFER(x)	(((x) & 0x3) << 8)
 #define PKT3_INDIRECT_BUFFER_MP                0x38
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A
+#define PKT3_COPY_DW			       0x3B
+#define		COPY_DW_SRC_IS_REG		(0 << 0)
+#define		COPY_DW_SRC_IS_MEM		(1 << 0)
+#define		COPY_DW_DST_IS_REG		(0 << 1)
+#define		COPY_DW_DST_IS_MEM		(1 << 1)
 #define PKT3_WAIT_REG_MEM                      0x3C
+#define		WAIT_REG_MEM_EQUAL		3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_CP_INTERRUPT                      0x40
@@ -106,6 +119,8 @@
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
 #define EVENT_TYPE_ZPASS_DONE                  0x15
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH	0x1f
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
                 /* 0 - any non-TS event
@@ -147,6 +162,12 @@
 #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PRED_S(predicate))
 
 /* Registers */
+#define R_008490_CP_STRMOUT_CNTL		     0x008490
+#define   S_008490_OFFSET_UPDATE_DONE(x)		(((x) & 0x1) << 0)
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0    0x008960 /* read-only */
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1    0x008964 /* read-only */
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2    0x008968 /* read-only */
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3    0x00896C /* read-only */
 #define R_008C00_SQ_CONFIG                           0x00008C00
 #define   S_008C00_VC_ENABLE(x)                        (((x) & 0x1) << 0)
 #define   G_008C00_VC_ENABLE(x)                        (((x) >> 0) & 0x1)
@@ -3144,6 +3165,26 @@
 #define   S_028AB8_VTX_CNT_EN(x)                       (((x) & 0x1) << 0)
 #define   G_028AB8_VTX_CNT_EN(x)                       (((x) >> 0) & 0x1)
 #define   C_028AB8_VTX_CNT_EN                          0xFFFFFFFE
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0	     0x028AD0
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0	     0x028AD4
+#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0	     0x028AD8
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0	     0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1	     0x028AE0
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1	     0x028AE4
+#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1	     0x028AE8
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1	     0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2	     0x028AF0
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2	     0x028AF4
+#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2	     0x028AF8
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2	     0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3	     0x028B00
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3	     0x028B04
+#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3	     0x028B08
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3	     0x028B0C
+#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0	     0x028B10
+#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1	     0x028B14
+#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2	     0x028B18
+#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3	     0x028B1C
 #define R_028B20_VGT_STRMOUT_BUFFER_EN               0x028B20
 #define   S_028B20_BUFFER_0_EN(x)                      (((x) & 0x1) << 0)
 #define   G_028B20_BUFFER_0_EN(x)                      (((x) >> 0) & 0x1)
@@ -3157,6 +3198,13 @@
 #define   S_028B20_BUFFER_3_EN(x)                      (((x) & 0x1) << 3)
 #define   G_028B20_BUFFER_3_EN(x)                      (((x) >> 3) & 0x1)
 #define   C_028B20_BUFFER_3_EN                         0xFFFFFFF7
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET	     0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30
+#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0	     0x028B44
+#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1	     0x028B48
+#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2	     0x028B4C
+#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3	     0x028B50
 #define R_028C20_PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX    0x028C20
 #define   S_028C20_S4_X(x)                             (((x) & 0xF) << 0)
 #define   G_028C20_S4_X(x)                             (((x) >> 0) & 0xF)
@@ -3280,6 +3328,9 @@
 #define   S_0085F0_CR2_ACTION_ENA(x)                   (((x) & 0x1) << 31)
 #define   G_0085F0_CR2_ACTION_ENA(x)                   (((x) >> 31) & 0x1)
 #define   C_0085F0_CR2_ACTION_ENA                      0x7FFFFFFF
+#define R_0085F4_CP_COHER_SIZE                       0x0085F4
+#define R_0085F8_CP_COHER_BASE                       0x0085F8
+#define R_0085FC_CP_COHER_STATUS                     0x0085FC
 
 
 #define R_02812C_CB_CLEAR_ALPHA                      0x02812C