intel_perf_counters: Add support for Gen7 platforms.

We finally received permission to release this; the counters should be
properly documented in the Haswell PRMs.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
index b528361..23d9ed3 100644
--- a/tools/intel_perf_counters.c
+++ b/tools/intel_perf_counters.c
@@ -137,6 +137,163 @@
 	[28] = "SF active and stalled",
 };
 
+#define GEN7_COUNTER_COUNT 44
+
+/**
+ * Names for aggregating counters A0-A43.  NULL entries are "Reserved."
+ */
+const char *gen7_counter_names[GEN7_COUNTER_COUNT] = {
+	/* A0:
+	 * The sum of all cycles on all cores actively executing instructions.
+	 * This does not count the time taken to service Send instructions.
+	 * This time is considered by shader active counters to give the result.
+	 */
+	[0]  = "Aggregated Core Array Active",
+	/* A1:
+	 * The sum of all cycles on all cores where the EU is not idle and is
+	 * not actively executing ISA instructions.  Generally this means that
+	 * all loaded threads on the EU are stalled on some data dependency,
+	 * but this also includes the time during which the TS is loading the
+	 * thread dispatch header into the EU prior to thread execution and no
+	 * other thread is fully loaded.
+	 */
+	[1]  = "Aggregated Core Array Stalled",
+	/* A2:
+	 * Total time in clocks the vertex shader spent active on all cores.
+	 */
+	[2]  = "Vertex Shader Active Time",
+	/* A4:
+	 * Total time in clocks the vertex shader spent stalled on all cores -
+	 * and the entire core was stalled as well.
+	 */
+	[4]  = "Vertex Shader Stall Time - Core Stall",
+	/* A5: Number of VS threads loaded at any given time in the EUs. */
+	[5]  = "# VS threads loaded",
+	/* A7:
+	 * Total time in clocks the Hull shader spent active on all cores.
+	 */
+	[7]  = "Hull Shader Active Time",
+	/* A9:
+	 * Total time in clocks the Hull shader spent stalled on all cores -
+	 * and the entire core was stalled as well.
+	 */
+	[9]  = "Hull Shader Stall Time - Core Stall",
+	/* A10: Number of HS threads loaded at any given time in the EUs. */
+	[10] = "# HS threads loaded",
+	/* A12:
+	 * Total time in clocks the Domain shader spent active on all cores.
+	 */
+	[12] = "Domain Shader Active Time",
+	/* A14:
+	 * Total time in clocks the domain shader spent stalled on all cores -
+	 * and the entire core was stalled as well.
+	 */
+	[14] = "Domain Shader Stall Time - Core Stall",
+	/* A15: Number of DS threads loaded at any given time in the EUs. */
+	[15] = "# DS threads loaded",
+	/* A17:
+	 * Total time in clocks the compute shader spent active on all cores.
+	 */
+	[17] = "Compute Shader Active Time",
+	/* A19:
+	 * Total time in clocks the compute shader spent stalled on all cores -
+	 * and the entire core was stalled as well.
+	 */
+	[19] = "Compute Shader Stall Time - Core Stall",
+	/* A20: Number of CS threads loaded at any given time in the EUs. */
+	[20] = "# CS threads loaded",
+	/* A22:
+	 * Total time in clocks the geometry shader spent active on all cores.
+	 */
+	[22] = "Geometry Shader Active Time",
+	/* A24:
+	 * Total time in clocks the geometry shader spent stalled on all cores -
+	 * and the entire core was stalled as well.
+	 */
+	[24] = "Geometry Shader Stall Time - Core Stall",
+	/* A25: Number of GS threads loaded at any given time in the EUs. */
+	[25] = "# GS threads loaded",
+	/* A27:
+	 * Total time in clocks the pixel shader spent active on all cores.
+	 */
+	[27] = "Pixel Shader Active Time",
+	/* A29:
+	 * Total time in clocks the pixel shader spent stalled on all cores -
+	 * and the entire core was stalled as well.
+	 */
+	[29] = "Pixel Shader Stall Time - Core Stall",
+	/* A30: Number of PS threads loaded at any given time in the EUs. */
+	[30] = "# PS threads loaded",
+	/* A32: Count of pixels that pass the fast check (8x8). */
+	[32] = "HiZ Fast Z Test Pixels Passing",
+	/* A33: Count of pixels that fail the fast check (8x8). */
+	[33] = "HiZ Fast Z Test Pixels Failing",
+	/* A34: Count of pixels passing the slow check (2x2). */
+	[34] = "Slow Z Test Pixels Passing",
+	/* A35: Count of pixels that fail the slow check (2x2). */
+	[35] = "Slow Z Test Pixels Failing",
+	/* A36: Number of pixels/samples killed in the pixel shader.
+	 * Ivybridge/Baytrail Erratum: Count reported is 2X the actual count for
+	 * dual source render target messages i.e. when PS has two output colors.
+	 */
+	[36] = "Pixel Kill Count",
+	/* A37:
+	 * Number of pixels/samples that fail alpha-test.  Alpha to coverage
+	 * may have some challenges in per-pixel invocation.
+	 */
+	[37] = "Alpha Test Pixels Failed",
+	/* A38:
+	 * Number of pixels/samples failing stencil test after the pixel shader
+	 * has executed.
+	 */
+	[38] = "Post PS Stencil Pixels Failed",
+	/* A39:
+	 * Number of pixels/samples failing Z test after the pixel shader has
+	 * executed.
+	 */
+	[39] = "Post PS Z buffer Pixels Failed",
+	/* A40:
+	 * Number of render target writes.  MRT scenarios will cause this
+	 * counter to increment multiple times.
+	 */
+	[40] = "3D/GPGPU Render Target Writes",
+	/* A41: Render engine is not idle.
+	 *
+	 * GPU Busy aggregate counter doesn't increment under the following
+	 * conditions:
+	 *
+	 * 1. Context Switch in Progress.
+	 * 2. GPU stalled on executing MI_WAIT_FOR_EVENT.
+	 * 3. GPU stalled on executing MI_SEMAPHORE_MBOX.
+	 * 4. RCS idle but other parts of GPU active (e.g. only media engines
+	 *    active)
+	 */
+	[41] = "Render Engine Busy",
+	/* A42:
+	 * VSunit is stalling VF (upstream unit) and starving HS (downstream
+	 * unit).
+	 */
+	[42] = "VS bottleneck",
+	/* A43:
+	 * GSunit is stalling DS (upstream unit) and starving SOL (downstream
+	 * unit).
+	 */
+	[43] = "GS bottleneck",
+};
+
+/** Ivybridge - Counter Select = 101 (binary).  NOTE(review): row 1 lists 7
+ * cells, yet gen7_get_counters() skips 3 dwords before A0 - confirm layout.
+ * A4   A3   A2   A1   A0   TIMESTAMP  ReportID
+ * A12  A11  A10  A9   A8   A7   A6    A5
+ * A20  A19  A18  A17  A16  A15  A14   A13
+ * A28  A27  A26  A25  A24  A23  A22   A21
+ * A36  A35  A34  A33  A32  A31  A30   A29
+ * A44  A43  A42  A41  A40  A39  A38   A37
+ * C3   C2   C1   C0   B3   B2   B1    B0
+ * C11  C10  C9   C8   C7   C6   C5    C4
+ */
+const int gen7_counter_format = 5; /* 0b101 */
+
 int have_totals = 0;
 uint32_t *totals;
 uint32_t *last_counter;
@@ -243,6 +400,40 @@
 	drm_intel_bo_unreference(stats_bo);
 }
 
+/* Request one perf counter report and fold its deltas into totals[]. */
+static void
+gen7_get_counters(void)
+{
+	drm_intel_bo *report_bo;
+	uint32_t *counters;
+	int n;
+
+	report_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096);
+
+	BEGIN_BATCH(3);
+	OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT | (3 - 2));
+	OUT_RELOC(report_bo,
+		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
+	OUT_BATCH(0);
+	ADVANCE_BATCH();
+
+	intel_batchbuffer_flush_on_ring(batch, I915_EXEC_RENDER);
+
+	drm_intel_bo_map(report_bo, 0);
+	/* Skip 3 header dwords (REPORT_ID, TIMESTAMP, ...) to reach A0. */
+	counters = (uint32_t *)report_bo->virtual + 3;
+	for (n = 0; n < GEN7_COUNTER_COUNT; n++) {
+		/* Only named counters are tracked; "Reserved" ones are NULL. */
+		if (gen7_counter_names[n]) {
+			totals[n] += counters[n] - last_counter[n];
+			last_counter[n] = counters[n];
+		}
+	}
+
+	drm_intel_bo_unmap(report_bo);
+	drm_intel_bo_unreference(report_bo);
+}
+
 #define STATS_CHECK_FREQUENCY	100
 #define STATS_REPORT_FREQUENCY	2
 
@@ -279,6 +470,11 @@
 		counter_count = GEN6_COUNTER_COUNT;
 		counter_format = gen6_counter_format;
 		get_counters = gen6_get_counters;
+	} else if (IS_GEN7(devid)) {
+		counter_name = gen7_counter_names;
+		counter_count = GEN7_COUNTER_COUNT;
+		counter_format = gen7_counter_format;
+		get_counters = gen7_get_counters;
 	} else {
 		printf("This tool is not yet supported on your platform.\n");
 		abort();
@@ -304,6 +500,9 @@
 			if (l % (STATS_CHECK_FREQUENCY / STATS_REPORT_FREQUENCY) == 0) {
 				if (have_totals) {
 					for (i = 0; i < counter_count; i++) {
+						/* Ignore "Reserved" counters */
+						if (!counter_name[i])
+							continue;
 						printf("%s: %u\n", counter_name[i],
 						       totals[i]);
 						totals[i] = 0;