r600: Emit EOP for more CF instruction types

So far, on pre-cayman chipsets, when the last CF instruction was
CF_OP_LOOP_END, CF_OP_CALL_FS, CF_OP_POP, or CF_OP_GDS, an extra
CF_NOP instruction was appended to carry the EOP flag, even though
this is not actually needed, because all these instructions support
the EOP flag.

This patch removes the fixup code, sets the EOP flag for the
corresponding instructions as well as others like CF_OP_TEX and
CF_OP_VTX, and adds printing of EOP for this type of instruction in
the disassembler.

This also fixes a bug where shaders were created that didn't actually have
the EOP flag set in the last CF instruction, which might have resulted
in GPU lockups.

[airlied: cleaned up a little]
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index ce7e861..8f9d1b8 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -71,10 +71,13 @@
 		} else if (cfop->flags & CF_CLAUSE) {
 			/* CF_TEX/VTX (CF_ALU already handled above) */
 			bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
-			bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) |
+			bc->bytecode[id] = S_SQ_CF_WORD1_CF_INST(opcode) |
 					S_SQ_CF_WORD1_BARRIER(1) |
 					S_SQ_CF_WORD1_VALID_PIXEL_MODE(cf->vpm) |
 					S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
+			if (bc->chip_class == EVERGREEN) /* no EOP on cayman */
+				bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program);
+			id++;
 		} else if (cfop->flags & CF_EXP) {
 			/* EXPORT instructions */
 			bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
@@ -133,12 +136,14 @@
 		} else {
 			/* other instructions */
 			bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
-			bc->bytecode[id++] =  S_SQ_CF_WORD1_CF_INST(opcode)|
+			bc->bytecode[id] = S_SQ_CF_WORD1_CF_INST(opcode) |
 					S_SQ_CF_WORD1_BARRIER(1) |
 					S_SQ_CF_WORD1_COND(cf->cond) |
 					S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
-					S_SQ_CF_WORD1_COUNT(cf->count) |
-					S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
+					S_SQ_CF_WORD1_COUNT(cf->count);
+			if (bc->chip_class == EVERGREEN) /* no EOP on cayman */
+				bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program);
+			id++;
 		}
 	}
 	return 0;
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 96bc337..69b2d14 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1625,7 +1625,8 @@
 	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
 	*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) |
 			S_SQ_CF_WORD1_BARRIER(1) |
-			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
+			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)|
+			S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
 }
 
 /* common for r600/r700 - eg in eg_asm.c */
@@ -2097,6 +2098,8 @@
 						bc->bytecode[id + 1], cfop->name);
 				fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr);
 				fprintf(stderr, "\n");
+				if (cf->end_of_program)
+					fprintf(stderr, "EOP ");
 			} else if (cfop->flags & CF_EXP) {
 				int o = 0;
 				const char *exp_type[] = {"PIXEL", "POS  ", "PARAM"};
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 1422abf..82b45b6 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -3809,7 +3809,7 @@
 			last = r600_isa_cf(ctx.bc->cf_last->op);
 
 		/* alu clause instructions don't have EOP bit, so add NOP */
-		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
+		if (!last || last->flags & CF_ALU)
 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
 
 		ctx.bc->cf_last->end_of_program = 1;
diff --git a/src/gallium/drivers/r600/r700_asm.c b/src/gallium/drivers/r600/r700_asm.c
index 04f8c62..395059c 100644
--- a/src/gallium/drivers/r600/r700_asm.c
+++ b/src/gallium/drivers/r600/r700_asm.c
@@ -30,7 +30,8 @@
 	*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R700, cf->op)) |
 			S_SQ_CF_WORD1_BARRIER(1) |
 			S_SQ_CF_WORD1_COUNT(count) |
-			S_SQ_CF_WORD1_COUNT_3(count >> 3);
+			S_SQ_CF_WORD1_COUNT_3(count >> 3)|
+			S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
 }
 
 int r700_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)