intel: update winsys

Mainly to deprecate INTEL_DOMAIN_x in favor of the new INTEL_RELOC_x flags.
diff --git a/icd/intel/cmd.c b/icd/intel/cmd.c
index ebf84a4..5dd962d 100644
--- a/icd/intel/cmd.c
+++ b/icd/intel/cmd.c
@@ -39,7 +39,7 @@
     void *ptr;
 
     bo = intel_winsys_alloc_buffer(winsys,
-            "batch buffer", bo_size, INTEL_DOMAIN_CPU);
+            "batch buffer", bo_size, true);
     if (!bo)
         return XGL_ERROR_OUT_OF_GPU_MEMORY;
 
@@ -262,7 +262,7 @@
 
         err = intel_bo_add_reloc(reloc->writer->bo,
                 sizeof(uint32_t) * reloc->pos, reloc->bo, reloc->val,
-                reloc->read_domains, reloc->write_domain, &presumed_offset);
+                reloc->flags, &presumed_offset);
         if (err) {
             cmd->result = XGL_ERROR_UNKNOWN;
             break;
diff --git a/icd/intel/cmd_pipeline.c b/icd/intel/cmd_pipeline.c
index 0b37c55..9409252 100644
--- a/icd/intel/cmd_pipeline.c
+++ b/icd/intel/cmd_pipeline.c
@@ -100,8 +100,6 @@
    const uint8_t cmd_len = 5;
    const uint32_t dw0 = GEN_RENDER_CMD(3D, GEN6, PIPE_CONTROL) |
                         (cmd_len - 2);
-   const uint32_t read_domains = INTEL_DOMAIN_INSTRUCTION;
-   const uint32_t write_domain = INTEL_DOMAIN_INSTRUCTION;
 
    CMD_ASSERT(cmd, 6, 7.5);
 
@@ -174,10 +172,12 @@
    cmd_batch_reserve_reloc(cmd, cmd_len, (bool) bo);
    cmd_batch_write(cmd, dw0);
    cmd_batch_write(cmd, dw1);
-   if (bo)
-       cmd_batch_reloc(cmd, bo_offset, bo, read_domains, write_domain);
-   else
+   if (bo) {
+       cmd_batch_reloc(cmd, bo_offset, bo, INTEL_RELOC_GGTT |
+                                           INTEL_RELOC_WRITE);
+   } else {
        cmd_batch_write(cmd, 0);
+   }
    cmd_batch_write(cmd, 0);
    cmd_batch_write(cmd, 0);
 }
@@ -275,8 +275,8 @@
 
     cmd_batch_reserve_reloc(cmd, cmd_len, 2);
     cmd_batch_write(cmd, dw0);
-    cmd_batch_reloc(cmd, offset, mem->bo, INTEL_DOMAIN_VERTEX, 0);
-    cmd_batch_reloc(cmd, end_offset, mem->bo, INTEL_DOMAIN_VERTEX, 0);
+    cmd_batch_reloc(cmd, offset, mem->bo, 0);
+    cmd_batch_reloc(cmd, end_offset, mem->bo, 0);
 }
 
 static inline void
@@ -338,8 +338,7 @@
     cmd_batch_write(cmd, view->cmd[0]);
     if (view->img) {
         cmd_batch_reloc(cmd, view->cmd[1], view->img->obj.mem->bo,
-                        INTEL_DOMAIN_RENDER,
-                        INTEL_DOMAIN_RENDER);
+                        INTEL_RELOC_WRITE);
     } else {
         cmd_batch_write(cmd, 0);
     }
@@ -367,8 +366,7 @@
     cmd_batch_write(cmd, view->cmd[6]);
     if (view->img) {
         cmd_batch_reloc(cmd, view->cmd[7], view->img->obj.mem->bo,
-                        INTEL_DOMAIN_RENDER,
-                        INTEL_DOMAIN_RENDER);
+                        INTEL_RELOC_WRITE);
     } else {
         cmd_batch_write(cmd, 0);
     }
@@ -392,8 +390,7 @@
     cmd_batch_write(cmd, view->cmd[8]);
     if (view->img) {
         cmd_batch_reloc(cmd, view->cmd[9], view->img->obj.mem->bo,
-                        INTEL_DOMAIN_RENDER,
-                        INTEL_DOMAIN_RENDER);
+                        INTEL_RELOC_WRITE);
     } else {
         cmd_batch_write(cmd, 0);
     }
@@ -761,7 +758,7 @@
 
                 memcpy(dw, view->cmd, sizeof(uint32_t) * view->cmd_len);
                 cmd_state_reloc(cmd, 1, view->cmd[1], view->img->obj.mem->bo,
-                        INTEL_DOMAIN_RENDER, INTEL_DOMAIN_RENDER);
+                        INTEL_RELOC_WRITE);
                 cmd_state_advance(cmd, view->cmd_len);
             }
             break;
@@ -775,7 +772,7 @@
 
                 memcpy(dw, view->cmd, sizeof(uint32_t) * view->cmd_len);
                 cmd_state_reloc(cmd, 1, view->cmd[1], view->mem->bo,
-                        INTEL_DOMAIN_RENDER, INTEL_DOMAIN_RENDER);
+                        INTEL_RELOC_WRITE);
                 cmd_state_advance(cmd, view->cmd_len);
             }
             break;
diff --git a/icd/intel/cmd_priv.h b/icd/intel/cmd_priv.h
index 1ea8378..0158199 100644
--- a/icd/intel/cmd_priv.h
+++ b/icd/intel/cmd_priv.h
@@ -40,18 +40,7 @@
     uint32_t val;
     struct intel_bo *bo;
 
-    /*
-     * With application state tracking promised by XGL, we should be able to
-     * set
-     *
-     *   I915_EXEC_NO_RELOC
-     *   I915_EXEC_HANDLE_LUT
-     *   I915_EXEC_IS_PINNED
-     *
-     * once we figure them out.
-     */
-    uint16_t read_domains;
-    uint16_t write_domain;
+    uint32_t flags;
 };
 
 static inline int cmd_gen(const struct intel_cmd *cmd)
@@ -80,8 +69,7 @@
                                         struct intel_cmd_writer *writer,
                                         XGL_UINT pos, uint32_t val,
                                         struct intel_bo *bo,
-                                        uint16_t read_domains,
-                                        uint16_t write_domain)
+                                        uint32_t flags)
 {
     struct intel_cmd_reloc *reloc = &cmd->relocs[cmd->reloc_used];
 
@@ -91,8 +79,7 @@
     reloc->pos = pos;
     reloc->val = val;
     reloc->bo = bo;
-    reloc->read_domains = read_domains;
-    reloc->write_domain = write_domain;
+    reloc->flags = flags;
 
     cmd->reloc_used++;
 }
@@ -151,13 +138,11 @@
  */
 static inline void cmd_batch_reloc(struct intel_cmd *cmd,
                                    uint32_t val, struct intel_bo *bo,
-                                   uint16_t read_domains,
-                                   uint16_t write_domain)
+                                   uint32_t flags)
 {
     struct intel_cmd_writer *writer = &cmd->batch;
 
-    cmd_writer_add_reloc(cmd, writer, writer->used, val,
-            bo, read_domains, write_domain);
+    cmd_writer_add_reloc(cmd, writer, writer->used, val, bo, flags);
 
     writer->used++;
 }
@@ -210,16 +195,13 @@
     const struct intel_cmd_writer *kernel = &cmd->kernel;
 
     cmd_reserve_reloc(cmd, 5);
-    cmd_writer_add_reloc(cmd, writer, 2, 1,
-            state->bo, INTEL_DOMAIN_SAMPLER, 0);
-    cmd_writer_add_reloc(cmd, writer, 3, 1,
-            state->bo, INTEL_DOMAIN_RENDER | INTEL_DOMAIN_INSTRUCTION, 0);
-    cmd_writer_add_reloc(cmd, writer, 5, 1,
-            kernel->bo, INTEL_DOMAIN_INSTRUCTION, 0);
-    cmd_writer_add_reloc(cmd, writer, 7, 1 + (state->size << 2),
-            state->bo, INTEL_DOMAIN_RENDER | INTEL_DOMAIN_INSTRUCTION, 0);
-    cmd_writer_add_reloc(cmd, writer, 9, 1 + (kernel->size << 2),
-            kernel->bo, INTEL_DOMAIN_INSTRUCTION, 0);
+    cmd_writer_add_reloc(cmd, writer, 2, 1, state->bo, 0);
+    cmd_writer_add_reloc(cmd, writer, 3, 1, state->bo, 0);
+    cmd_writer_add_reloc(cmd, writer, 5, 1, kernel->bo, 0);
+    cmd_writer_add_reloc(cmd, writer, 7, 1 +
+            (state->size << 2), state->bo, 0);
+    cmd_writer_add_reloc(cmd, writer, 9, 1 +
+            (kernel->size << 2), kernel->bo, 0);
 
     if (cmd->batch.used & 1) {
         cmd_batch_reserve(cmd, 1);
@@ -278,13 +260,11 @@
 static inline void cmd_state_reloc(struct intel_cmd *cmd,
                                    XGL_INT offset, uint32_t val,
                                    struct intel_bo *bo,
-                                   uint16_t read_domains,
-                                   uint16_t write_domain)
+                                   uint32_t flags)
 {
     struct intel_cmd_writer *writer = &cmd->state;
 
-    cmd_writer_add_reloc(cmd, writer, writer->used + offset, val,
-            bo, read_domains, write_domain);
+    cmd_writer_add_reloc(cmd, writer, writer->used + offset, val, bo, flags);
 }
 
 /**
diff --git a/icd/intel/dev.c b/icd/intel/dev.c
index 3613f54..f8d7720 100644
--- a/icd/intel/dev.c
+++ b/icd/intel/dev.c
@@ -99,7 +99,7 @@
     }
 
     dev->cmd_scratch_bo = intel_winsys_alloc_buffer(dev->winsys,
-            "command buffer scratch", 4096, INTEL_DOMAIN_INSTRUCTION);
+            "command buffer scratch", 4096, false);
     if (!dev->cmd_scratch_bo) {
         intel_dev_destroy(dev);
         return XGL_ERROR_OUT_OF_GPU_MEMORY;
diff --git a/icd/intel/kmd/winsys.h b/icd/intel/kmd/winsys.h
index df76154..e080a08 100644
--- a/icd/intel/kmd/winsys.h
+++ b/icd/intel/kmd/winsys.h
@@ -28,6 +28,7 @@
 #ifndef INTEL_WINSYS_H
 #define INTEL_WINSYS_H
 
+#include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
 
@@ -45,14 +46,10 @@
 };
 
 /* this is compatible with i915_drm.h's definitions */
-enum intel_domain_flag {
-   INTEL_DOMAIN_CPU           = 0x00000001,
-   INTEL_DOMAIN_RENDER        = 0x00000002,
-   INTEL_DOMAIN_SAMPLER       = 0x00000004,
-   INTEL_DOMAIN_COMMAND	      = 0x00000008,
-   INTEL_DOMAIN_INSTRUCTION   = 0x00000010,
-   INTEL_DOMAIN_VERTEX        = 0x00000020,
-   INTEL_DOMAIN_GTT           = 0x00000040,
+enum intel_reloc_flag {
+   INTEL_RELOC_FENCE          = 1 << 0,
+   INTEL_RELOC_GGTT           = 1 << 1,
+   INTEL_RELOC_WRITE          = 1 << 2,
 };
 
 /* this is compatible with i915_drm.h's definitions */
@@ -74,7 +71,10 @@
 struct intel_winsys_info {
    int devid;
 
-   int max_batch_size;
+   /* the sizes of the aperture in bytes */
+   size_t aperture_total;
+   size_t aperture_mappable;
+
    bool has_llc;
    bool has_address_swizzling;
    bool has_logical_context;
@@ -120,7 +120,7 @@
  * \param tiling           Tiling mode.
  * \param pitch            Pitch of the bo.
  * \param height           Height of the bo.
- * \param initial_domain   Initial (write) domain.
+ * \param cpu_init         Will be initialized by CPU.
  */
 struct intel_bo *
 intel_winsys_alloc_bo(struct intel_winsys *winsys,
@@ -128,7 +128,7 @@
                       enum intel_tiling_mode tiling,
                       unsigned long pitch,
                       unsigned long height,
-                      uint32_t initial_domain);
+                      bool cpu_init);
 
 /**
  * Allocate a linear buffer object.
@@ -137,10 +137,10 @@
 intel_winsys_alloc_buffer(struct intel_winsys *winsys,
                           const char *name,
                           unsigned long size,
-                          uint32_t initial_domain)
+                          bool cpu_init)
 {
    return intel_winsys_alloc_bo(winsys, name,
-         INTEL_TILING_NONE, size, 1, initial_domain);
+         INTEL_TILING_NONE, size, 1, cpu_init);
 }
 
 /**
@@ -224,8 +224,7 @@
  * sequential writes, but reads would be very slow.  Callers always have a
  * linear view of the bo.
  *
- * map_unsynchronized() is similar to map_gtt(), except that it does not
- * block.
+ * map_gtt_async() is similar to map_gtt(), except that it does not block.
  */
 void *
 intel_bo_map(struct intel_bo *bo, bool write_enable);
@@ -234,7 +233,7 @@
 intel_bo_map_gtt(struct intel_bo *bo);
 
 void *
-intel_bo_map_unsynchronized(struct intel_bo *bo);
+intel_bo_map_gtt_async(struct intel_bo *bo);
 
 /**
  * Unmap \p bo.
@@ -268,8 +267,7 @@
 int
 intel_bo_add_reloc(struct intel_bo *bo, uint32_t offset,
                    struct intel_bo *target_bo, uint32_t target_offset,
-                   uint32_t read_domains, uint32_t write_domain,
-                   uint64_t *presumed_offset);
+                   uint32_t flags, uint64_t *presumed_offset);
 
 /**
  * Return the current number of relocations.
diff --git a/icd/intel/kmd/winsys_drm.c b/icd/intel/kmd/winsys_drm.c
index 571e3b8..ba022b9 100644
--- a/icd/intel/kmd/winsys_drm.c
+++ b/icd/intel/kmd/winsys_drm.c
@@ -40,8 +40,6 @@
 #include "icd.h"
 #include "winsys.h"
 
-#define BATCH_SZ (8192 * sizeof(uint32_t))
-
 struct intel_winsys {
    int fd;
    drm_intel_bufmgr *bufmgr;
@@ -132,8 +130,6 @@
 
    info->devid = drm_intel_bufmgr_gem_get_devid(winsys->bufmgr);
 
-   info->max_batch_size = BATCH_SZ;
-
    get_param(winsys, I915_PARAM_HAS_LLC, &val);
    info->has_llc = val;
    info->has_address_swizzling = test_address_swizzling(winsys);
@@ -159,6 +155,8 @@
 struct intel_winsys *
 intel_winsys_create_for_fd(int fd)
 {
+   /* so that we can have enough (up to 4094) relocs per bo */
+   const int batch_size = sizeof(uint32_t) * 8192;
    struct intel_winsys *winsys;
 
    winsys = icd_alloc(sizeof(*winsys), 0, XGL_SYSTEM_ALLOC_INTERNAL);
@@ -169,7 +167,7 @@
 
    winsys->fd = fd;
 
-   winsys->bufmgr = drm_intel_bufmgr_gem_init(winsys->fd, BATCH_SZ);
+   winsys->bufmgr = drm_intel_bufmgr_gem_init(winsys->fd, batch_size);
    if (!winsys->bufmgr) {
       icd_free(winsys);
       return NULL;
@@ -183,12 +181,7 @@
 
    /*
     * No need to implicitly set up a fence register for each non-linear reloc
-    * entry.  When a fence register is needed for a reloc entry,
-    * drm_intel_bo_emit_reloc_fence() will be called explicitly.
-    *
-    * intel_bo_add_reloc() currently lacks "bool fenced" for this to work.
-    * But we never need a fence register on GEN4+ so we do not need to worry
-    * about it yet.
+    * entry.  INTEL_RELOC_FENCE will be set on reloc entries that need them.
     */
    drm_intel_bufmgr_gem_enable_fenced_relocs(winsys->bufmgr);
 
@@ -224,10 +217,8 @@
                       enum intel_tiling_mode tiling,
                       unsigned long pitch,
                       unsigned long height,
-                      uint32_t initial_domain)
+                      bool cpu_init)
 {
-   const bool for_render =
-      (initial_domain & (INTEL_DOMAIN_RENDER | INTEL_DOMAIN_INSTRUCTION));
    const unsigned int alignment = 4096; /* always page-aligned */
    unsigned long size;
    drm_intel_bo *bo;
@@ -250,12 +241,12 @@
 
    size = pitch * height;
 
-   if (for_render) {
-      bo = drm_intel_bo_alloc_for_render(winsys->bufmgr,
-            name, size, alignment);
+   if (cpu_init) {
+      bo = drm_intel_bo_alloc(winsys->bufmgr, name, size, alignment);
    }
    else {
-      bo = drm_intel_bo_alloc(winsys->bufmgr, name, size, alignment);
+      bo = drm_intel_bo_alloc_for_render(winsys->bufmgr,
+            name, size, alignment);
    }
 
    if (bo && tiling != INTEL_TILING_NONE) {
@@ -466,7 +457,7 @@
 }
 
 void *
-intel_bo_map_unsynchronized(struct intel_bo *bo)
+intel_bo_map_gtt_async(struct intel_bo *bo)
 {
    int err;
 
@@ -504,14 +495,37 @@
 int
 intel_bo_add_reloc(struct intel_bo *bo, uint32_t offset,
                    struct intel_bo *target_bo, uint32_t target_offset,
-                   uint32_t read_domains, uint32_t write_domain,
-                   uint64_t *presumed_offset)
+                   uint32_t flags, uint64_t *presumed_offset)
 {
+   uint32_t read_domains, write_domain;
    int err;
 
-   err = drm_intel_bo_emit_reloc(gem_bo(bo), offset,
-         gem_bo(target_bo), target_offset,
-         read_domains, write_domain);
+   if (flags & INTEL_RELOC_WRITE) {
+      /*
+       * Because of the translation to domains, INTEL_RELOC_GGTT should only
+       * be set on GEN6 when the bo is written by MI_* or PIPE_CONTROL.  The
+       * kernel will translate it back to INTEL_RELOC_GGTT.
+       */
+      write_domain = (flags & INTEL_RELOC_GGTT) ?
+         I915_GEM_DOMAIN_INSTRUCTION : I915_GEM_DOMAIN_RENDER;
+      read_domains = write_domain;
+   } else {
+      write_domain = 0;
+      read_domains = I915_GEM_DOMAIN_RENDER |
+                     I915_GEM_DOMAIN_SAMPLER |
+                     I915_GEM_DOMAIN_INSTRUCTION |
+                     I915_GEM_DOMAIN_VERTEX;
+   }
+
+   if (flags & INTEL_RELOC_FENCE) {
+      err = drm_intel_bo_emit_reloc_fence(gem_bo(bo), offset,
+            gem_bo(target_bo), target_offset,
+            read_domains, write_domain);
+   } else {
+      err = drm_intel_bo_emit_reloc(gem_bo(bo), offset,
+            gem_bo(target_bo), target_offset,
+            read_domains, write_domain);
+   }
 
    *presumed_offset = gem_bo(target_bo)->offset64 + target_offset;
 
diff --git a/icd/intel/mem.h b/icd/intel/mem.h
index 7522794..10b5580 100644
--- a/icd/intel/mem.h
+++ b/icd/intel/mem.h
@@ -45,7 +45,7 @@
 
 static inline void *intel_mem_map(struct intel_mem *mem, XGL_FLAGS flags)
 {
-    return intel_bo_map_unsynchronized(mem->bo);
+    return intel_bo_map_gtt_async(mem->bo);
 }
 
 static inline void *intel_mem_map_sync(struct intel_mem *mem, bool rw)
diff --git a/icd/intel/queue.c b/icd/intel/queue.c
index e80aa37..937c1f0 100644
--- a/icd/intel/queue.c
+++ b/icd/intel/queue.c
@@ -56,7 +56,7 @@
     void *ptr;
 
     bo = intel_winsys_alloc_buffer(queue->dev->winsys,
-            "queue bo", size, INTEL_DOMAIN_CPU);
+            "queue bo", size, true);
     if (!bo)
         return NULL;