intel: implement intelCmd*Query()
diff --git a/icd/intel/CMakeLists.txt b/icd/intel/CMakeLists.txt
index 0896bf2..15d369b 100644
--- a/icd/intel/CMakeLists.txt
+++ b/icd/intel/CMakeLists.txt
@@ -21,6 +21,7 @@
     cmd.c
     cmd_prepare.c
     cmd_pipeline.c
+    cmd_query.c
     dev.c
     dispatch.c
     dset.c
diff --git a/icd/intel/cmd.c b/icd/intel/cmd.c
index fb73514..b193d20 100644
--- a/icd/intel/cmd.c
+++ b/icd/intel/cmd.c
@@ -448,29 +448,6 @@
 {
 }
 
-XGL_VOID XGLAPI intelCmdBeginQuery(
-    XGL_CMD_BUFFER                              cmdBuffer,
-    XGL_QUERY_POOL                              queryPool,
-    XGL_UINT                                    slot,
-    XGL_FLAGS                                   flags)
-{
-}
-
-XGL_VOID XGLAPI intelCmdEndQuery(
-    XGL_CMD_BUFFER                              cmdBuffer,
-    XGL_QUERY_POOL                              queryPool,
-    XGL_UINT                                    slot)
-{
-}
-
-XGL_VOID XGLAPI intelCmdResetQueryPool(
-    XGL_CMD_BUFFER                              cmdBuffer,
-    XGL_QUERY_POOL                              queryPool,
-    XGL_UINT                                    startQuery,
-    XGL_UINT                                    queryCount)
-{
-}
-
 XGL_VOID XGLAPI intelCmdWriteTimestamp(
     XGL_CMD_BUFFER                              cmdBuffer,
     XGL_TIMESTAMP_TYPE                          timestampType,
diff --git a/icd/intel/cmd_pipeline.c b/icd/intel/cmd_pipeline.c
index 37395ad..99489b2 100644
--- a/icd/intel/cmd_pipeline.c
+++ b/icd/intel/cmd_pipeline.c
@@ -1205,6 +1205,18 @@
     gen6_PIPE_CONTROL(cmd, pipe_control_dw0, NULL, 0);
 }
 
+/* Emit a PIPE_CONTROL with DEPTH_STALL | WRITE_PS_DEPTH_COUNT for bo+offset. */
+void cmd_batch_depth_count(struct intel_cmd *cmd,
+                           struct intel_bo *bo,
+                           XGL_GPU_SIZE offset)
+{
+    const uint32_t flags = GEN6_PIPE_CONTROL_DEPTH_STALL |
+                           GEN6_PIPE_CONTROL_WRITE_PS_DEPTH_COUNT;
+
+    cmd_wa_gen6_pre_depth_stall_write(cmd);
+    gen6_PIPE_CONTROL(cmd, flags, bo, offset);
+}
+
 static void gen6_cc_states(struct intel_cmd *cmd)
 {
     const struct intel_blend_state *blend = cmd->bind.state.blend;
diff --git a/icd/intel/cmd_priv.h b/icd/intel/cmd_priv.h
index 42795c5..ece081c 100644
--- a/icd/intel/cmd_priv.h
+++ b/icd/intel/cmd_priv.h
@@ -209,6 +209,10 @@
 
 void cmd_batch_flush(struct intel_cmd *cmd, uint32_t pipe_control_dw0);
 
+/** Emit a PIPE_CONTROL that writes PS_DEPTH_COUNT to \p bo at \p offset. */
+void cmd_batch_depth_count(struct intel_cmd *cmd, struct intel_bo *bo,
+                           XGL_GPU_SIZE offset);
+
 /**
  * Reserve \p len DWords in the state buffer for building a hardware state.
  * The current writer position is aligned to \p alignment first.  Both the
diff --git a/icd/intel/cmd_query.c b/icd/intel/cmd_query.c
new file mode 100644
index 0000000..0fb5894
--- /dev/null
+++ b/icd/intel/cmd_query.c
@@ -0,0 +1,155 @@
+/*
+ * XGL
+ *
+ * Copyright (C) 2014 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "mem.h"
+#include "obj.h"
+#include "query.h"
+#include "cmd_priv.h"
+
+/* Emit MI_STORE_REGISTER_MEM: dw0, then reg, then a write reloc for bo+offset. */
+static void gen6_MI_STORE_REGISTER_MEM(struct intel_cmd *cmd,
+                                       struct intel_bo *bo,
+                                       uint32_t offset,
+                                       uint32_t reg)
+{
+    const uint8_t len = 3;
+    /* dw0 encodes (len - 2); USE_GGTT is added on GEN6 only */
+    const uint32_t header = GEN6_MI_CMD(MI_STORE_REGISTER_MEM) | (len - 2) |
+        ((cmd_gen(cmd) == INTEL_GEN(6)) ?
+         GEN6_MI_STORE_REGISTER_MEM_DW0_USE_GGTT : 0);
+
+    cmd_batch_reserve(cmd, len);
+    cmd_batch_write(cmd, header);
+    cmd_batch_write(cmd, reg);
+    cmd_batch_reloc(cmd, offset, bo, INTEL_RELOC_GGTT | INTEL_RELOC_WRITE);
+}
+
+/* Emit MI_STORE_DATA_IMM writing the 64-bit val to bo+offset as two DWords. */
+static void gen6_MI_STORE_DATA_IMM(struct intel_cmd *cmd,
+                                   struct intel_bo *bo,
+                                   uint32_t offset, uint64_t val)
+{
+    const uint8_t len = 5;
+    const uint32_t val_lo = (uint32_t) val;
+    const uint32_t val_hi = (uint32_t) (val >> 32);
+    const uint32_t header = GEN6_MI_CMD(MI_STORE_DATA_IMM) | (len - 2) |
+        ((cmd_gen(cmd) == INTEL_GEN(6)) ?
+         GEN6_MI_STORE_DATA_IMM_DW0_USE_GGTT : 0);
+
+    cmd_batch_reserve(cmd, len);
+    cmd_batch_write(cmd, header);
+    cmd_batch_write(cmd, 0);
+    cmd_batch_reloc(cmd, offset, bo, INTEL_RELOC_GGTT | INTEL_RELOC_WRITE);
+    cmd_batch_write(cmd, val_lo);
+    cmd_batch_write(cmd, val_hi);
+}
+
+/* Store every statistics counter as a 64-bit value in consecutive slots. */
+static void cmd_query_pipeline_statistics(struct intel_cmd *cmd,
+                                          struct intel_bo *bo,
+                                          XGL_GPU_SIZE offset)
+{
+    const uint32_t regs[] = {
+        GEN6_REG_PS_INVOCATION_COUNT,
+        GEN6_REG_CL_PRIMITIVES_COUNT,
+        GEN6_REG_CL_INVOCATION_COUNT,
+        GEN6_REG_VS_INVOCATION_COUNT,
+        GEN6_REG_GS_INVOCATION_COUNT,
+        GEN6_REG_GS_PRIMITIVES_COUNT,
+        GEN6_REG_IA_PRIMITIVES_COUNT,
+        GEN6_REG_IA_VERTICES_COUNT,
+        (cmd_gen(cmd) >= INTEL_GEN(7)) ? GEN6_REG_HS_INVOCATION_COUNT : 0,
+        (cmd_gen(cmd) >= INTEL_GEN(7)) ? GEN6_REG_DS_INVOCATION_COUNT : 0,
+        0,    /* zero entries have no register; the slot is written as 0 */
+    };
+    XGL_UINT i;
+
+    cmd_batch_flush(cmd, GEN6_PIPE_CONTROL_CS_STALL);
+    for (i = 0; i < ARRAY_SIZE(regs); i++) {
+        if (regs[i]) {
+            /* low 32 bits from reg, high 32 bits from reg + 4 */
+            gen6_MI_STORE_REGISTER_MEM(cmd, bo, offset, regs[i]);
+            gen6_MI_STORE_REGISTER_MEM(cmd, bo, offset + 4, regs[i] + 4);
+        } else {
+            gen6_MI_STORE_DATA_IMM(cmd, bo, offset, 0);
+        }
+        offset += sizeof(uint64_t);    /* move on to the next counter's slot */
+    }
+}
+
+/*
+ * Emit the commands that write a query's starting value at the base of its
+ * slot.  \p flags is not consulted; other query types set XGL_ERROR_UNKNOWN.
+ */
+XGL_VOID XGLAPI intelCmdBeginQuery(
+    XGL_CMD_BUFFER                              cmdBuffer,
+    XGL_QUERY_POOL                              queryPool,
+    XGL_UINT                                    slot,
+    XGL_FLAGS                                   flags)
+{
+    struct intel_cmd *cmd = intel_cmd(cmdBuffer);
+    struct intel_query *pool = intel_query(queryPool);
+    struct intel_bo *bo = pool->obj.mem->bo;
+    const XGL_GPU_SIZE slot_base = pool->slot_stride * slot;
+
+    if (pool->type == XGL_QUERY_OCCLUSION) {
+        cmd_batch_depth_count(cmd, bo, slot_base);
+    } else if (pool->type == XGL_QUERY_PIPELINE_STATISTICS) {
+        cmd_query_pipeline_statistics(cmd, bo, slot_base);
+    } else {
+        cmd->result = XGL_ERROR_UNKNOWN;
+    }
+}
+
+/*
+ * Emit the commands that write a query's ending value past the slot base:
+ * sizeof(uint64_t) in for occlusion, sizeof(XGL_PIPELINE_STATISTICS_DATA) in
+ * for pipeline statistics.  Other query types set XGL_ERROR_UNKNOWN.
+ */
+XGL_VOID XGLAPI intelCmdEndQuery(
+    XGL_CMD_BUFFER cmdBuffer, XGL_QUERY_POOL queryPool, XGL_UINT slot)
+{
+    struct intel_cmd *cmd = intel_cmd(cmdBuffer);
+    struct intel_query *pool = intel_query(queryPool);
+    struct intel_bo *bo = pool->obj.mem->bo;
+    const XGL_GPU_SIZE slot_base = pool->slot_stride * slot;
+
+    if (pool->type == XGL_QUERY_OCCLUSION) {
+        cmd_batch_depth_count(cmd, bo, slot_base + sizeof(uint64_t));
+    } else if (pool->type == XGL_QUERY_PIPELINE_STATISTICS) {
+        const XGL_GPU_SIZE end_offset =
+            slot_base + sizeof(XGL_PIPELINE_STATISTICS_DATA);
+        cmd_query_pipeline_statistics(cmd, bo, end_offset);
+    } else {
+        cmd->result = XGL_ERROR_UNKNOWN;
+    }
+}
+
+XGL_VOID XGLAPI intelCmdResetQueryPool(
+    XGL_CMD_BUFFER                              cmdBuffer,
+    XGL_QUERY_POOL                              queryPool,
+    XGL_UINT                                    startQuery,
+    XGL_UINT                                    queryCount)
+{ /* NOTE(review): empty body, nothing is emitted; confirm no reset is needed */
+}
diff --git a/icd/intel/query.c b/icd/intel/query.c
index dc29ed8..d354df3 100644
--- a/icd/intel/query.c
+++ b/icd/intel/query.c
@@ -81,7 +81,7 @@
      */
     switch (info->queryType) {
     case XGL_QUERY_OCCLUSION:
-        query->slot_stride = u_align(sizeof(uint32_t) * 2, 64);
+        query->slot_stride = u_align(sizeof(uint64_t) * 2, 64);
         break;
     case XGL_QUERY_PIPELINE_STATISTICS:
         query->slot_stride =