drm/i915: Avoid vmallocing a buffer for the relocations

... perform an access validation check up front instead and copy them in
on-demand, during i915_gem_object_pin_and_relocate(). As around 20% of
the CPU overhead may be spent inside vmalloc for the relocation entries
when submitting an execbuffer [for x11perf -aa10text], the savings are
considerable and result in around a 10% throughput increase [for glyphs].

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 72ab303..67998e8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3291,12 +3291,12 @@
 static int
 i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 				 struct drm_file *file_priv,
-				 struct drm_i915_gem_exec_object2 *entry,
-				 struct drm_i915_gem_relocation_entry *relocs)
+				 struct drm_i915_gem_exec_object2 *entry)
 {
 	struct drm_device *dev = obj->dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj_priv = to_intel_bo(obj);
+	struct drm_i915_gem_relocation_entry __user *user_relocs;
 	int i, ret;
 	void __iomem *reloc_page;
 	bool need_fence;
@@ -3337,15 +3337,24 @@
 	/* Apply the relocations, using the GTT aperture to avoid cache
 	 * flushing requirements.
 	 */
+	user_relocs = (void __user *)(uintptr_t)entry->relocs_ptr;
 	for (i = 0; i < entry->relocation_count; i++) {
-		struct drm_i915_gem_relocation_entry *reloc= &relocs[i];
+		struct drm_i915_gem_relocation_entry reloc;
 		struct drm_gem_object *target_obj;
 		struct drm_i915_gem_object *target_obj_priv;
 		uint32_t reloc_val, reloc_offset;
 		uint32_t __iomem *reloc_entry;
 
+		ret = __copy_from_user_inatomic(&reloc,
+						user_relocs+i,
+						sizeof(reloc));
+		if (ret) {
+			i915_gem_object_unpin(obj);
+			return -EFAULT;
+		}
+
 		target_obj = drm_gem_object_lookup(obj->dev, file_priv,
-						   reloc->target_handle);
+						   reloc.target_handle);
 		if (target_obj == NULL) {
 			i915_gem_object_unpin(obj);
 			return -ENOENT;
@@ -3358,13 +3367,13 @@
 			 "presumed %08x delta %08x\n",
 			 __func__,
 			 obj,
-			 (int) reloc->offset,
-			 (int) reloc->target_handle,
-			 (int) reloc->read_domains,
-			 (int) reloc->write_domain,
+			 (int) reloc.offset,
+			 (int) reloc.target_handle,
+			 (int) reloc.read_domains,
+			 (int) reloc.write_domain,
 			 (int) target_obj_priv->gtt_offset,
-			 (int) reloc->presumed_offset,
-			 reloc->delta);
+			 (int) reloc.presumed_offset,
+			 reloc.delta);
 #endif
 
 		/* The target buffer should have appeared before us in the
@@ -3372,89 +3381,89 @@
 		 */
 		if (target_obj_priv->gtt_space == NULL) {
 			DRM_ERROR("No GTT space found for object %d\n",
-				  reloc->target_handle);
+				  reloc.target_handle);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
 
 		/* Validate that the target is in a valid r/w GPU domain */
-		if (reloc->write_domain & (reloc->write_domain - 1)) {
+		if (reloc.write_domain & (reloc.write_domain - 1)) {
 			DRM_ERROR("reloc with multiple write domains: "
 				  "obj %p target %d offset %d "
 				  "read %08x write %08x",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset,
-				  reloc->read_domains,
-				  reloc->write_domain);
+				  obj, reloc.target_handle,
+				  (int) reloc.offset,
+				  reloc.read_domains,
+				  reloc.write_domain);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
-		if (reloc->write_domain & I915_GEM_DOMAIN_CPU ||
-		    reloc->read_domains & I915_GEM_DOMAIN_CPU) {
+		if (reloc.write_domain & I915_GEM_DOMAIN_CPU ||
+		    reloc.read_domains & I915_GEM_DOMAIN_CPU) {
 			DRM_ERROR("reloc with read/write CPU domains: "
 				  "obj %p target %d offset %d "
 				  "read %08x write %08x",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset,
-				  reloc->read_domains,
-				  reloc->write_domain);
+				  obj, reloc.target_handle,
+				  (int) reloc.offset,
+				  reloc.read_domains,
+				  reloc.write_domain);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
-		if (reloc->write_domain && target_obj->pending_write_domain &&
-		    reloc->write_domain != target_obj->pending_write_domain) {
+		if (reloc.write_domain && target_obj->pending_write_domain &&
+		    reloc.write_domain != target_obj->pending_write_domain) {
 			DRM_ERROR("Write domain conflict: "
 				  "obj %p target %d offset %d "
 				  "new %08x old %08x\n",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset,
-				  reloc->write_domain,
+				  obj, reloc.target_handle,
+				  (int) reloc.offset,
+				  reloc.write_domain,
 				  target_obj->pending_write_domain);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
 
-		target_obj->pending_read_domains |= reloc->read_domains;
-		target_obj->pending_write_domain |= reloc->write_domain;
+		target_obj->pending_read_domains |= reloc.read_domains;
+		target_obj->pending_write_domain |= reloc.write_domain;
 
 		/* If the relocation already has the right value in it, no
 		 * more work needs to be done.
 		 */
-		if (target_obj_priv->gtt_offset == reloc->presumed_offset) {
+		if (target_obj_priv->gtt_offset == reloc.presumed_offset) {
 			drm_gem_object_unreference(target_obj);
 			continue;
 		}
 
 		/* Check that the relocation address is valid... */
-		if (reloc->offset > obj->size - 4) {
+		if (reloc.offset > obj->size - 4) {
 			DRM_ERROR("Relocation beyond object bounds: "
 				  "obj %p target %d offset %d size %d.\n",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset, (int) obj->size);
+				  obj, reloc.target_handle,
+				  (int) reloc.offset, (int) obj->size);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
-		if (reloc->offset & 3) {
+		if (reloc.offset & 3) {
 			DRM_ERROR("Relocation not 4-byte aligned: "
 				  "obj %p target %d offset %d.\n",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset);
+				  obj, reloc.target_handle,
+				  (int) reloc.offset);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
 
 		/* and points to somewhere within the target object. */
-		if (reloc->delta >= target_obj->size) {
+		if (reloc.delta >= target_obj->size) {
 			DRM_ERROR("Relocation beyond target object bounds: "
 				  "obj %p target %d delta %d size %d.\n",
-				  obj, reloc->target_handle,
-				  (int) reloc->delta, (int) target_obj->size);
+				  obj, reloc.target_handle,
+				  (int) reloc.delta, (int) target_obj->size);
 			drm_gem_object_unreference(target_obj);
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
@@ -3470,23 +3479,18 @@
 		/* Map the page containing the relocation we're going to
 		 * perform.
 		 */
-		reloc_offset = obj_priv->gtt_offset + reloc->offset;
+		reloc_offset = obj_priv->gtt_offset + reloc.offset;
 		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
 						      (reloc_offset &
 						       ~(PAGE_SIZE - 1)),
 						      KM_USER0);
 		reloc_entry = (uint32_t __iomem *)(reloc_page +
 						   (reloc_offset & (PAGE_SIZE - 1)));
-		reloc_val = target_obj_priv->gtt_offset + reloc->delta;
+		reloc_val = target_obj_priv->gtt_offset + reloc.delta;
 
 		writel(reloc_val, reloc_entry);
 		io_mapping_unmap_atomic(reloc_page, KM_USER0);
 
-		/* The updated presumed offset for this entry will be
-		 * copied back out to the user.
-		 */
-		reloc->presumed_offset = target_obj_priv->gtt_offset;
-
 		drm_gem_object_unreference(target_obj);
 	}
 
@@ -3551,86 +3555,8 @@
 }
 
 static int
-i915_gem_get_relocs_from_user(struct drm_i915_gem_exec_object2 *exec_list,
-			      uint32_t buffer_count,
-			      struct drm_i915_gem_relocation_entry **relocs)
-{
-	uint32_t reloc_count = 0, reloc_index = 0, i;
-	int ret;
-
-	*relocs = NULL;
-	for (i = 0; i < buffer_count; i++) {
-		if (reloc_count + exec_list[i].relocation_count < reloc_count)
-			return -EINVAL;
-		reloc_count += exec_list[i].relocation_count;
-	}
-
-	*relocs = drm_calloc_large(reloc_count, sizeof(**relocs));
-	if (*relocs == NULL) {
-		DRM_ERROR("failed to alloc relocs, count %d\n", reloc_count);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < buffer_count; i++) {
-		struct drm_i915_gem_relocation_entry __user *user_relocs;
-
-		user_relocs = (void __user *)(uintptr_t)exec_list[i].relocs_ptr;
-
-		ret = copy_from_user(&(*relocs)[reloc_index],
-				     user_relocs,
-				     exec_list[i].relocation_count *
-				     sizeof(**relocs));
-		if (ret != 0) {
-			drm_free_large(*relocs);
-			*relocs = NULL;
-			return -EFAULT;
-		}
-
-		reloc_index += exec_list[i].relocation_count;
-	}
-
-	return 0;
-}
-
-static int
-i915_gem_put_relocs_to_user(struct drm_i915_gem_exec_object2 *exec_list,
-			    uint32_t buffer_count,
-			    struct drm_i915_gem_relocation_entry *relocs)
-{
-	uint32_t reloc_count = 0, i;
-	int ret = 0;
-
-	if (relocs == NULL)
-	    return 0;
-
-	for (i = 0; i < buffer_count; i++) {
-		struct drm_i915_gem_relocation_entry __user *user_relocs;
-		int unwritten;
-
-		user_relocs = (void __user *)(uintptr_t)exec_list[i].relocs_ptr;
-
-		unwritten = copy_to_user(user_relocs,
-					 &relocs[reloc_count],
-					 exec_list[i].relocation_count *
-					 sizeof(*relocs));
-
-		if (unwritten) {
-			ret = -EFAULT;
-			goto err;
-		}
-
-		reloc_count += exec_list[i].relocation_count;
-	}
-
-err:
-	drm_free_large(relocs);
-
-	return ret;
-}
-
-static int
-i915_gem_check_execbuffer (struct drm_i915_gem_execbuffer2 *exec,
-			   uint64_t exec_offset)
+i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec,
+			  uint64_t exec_offset)
 {
 	uint32_t exec_start, exec_len;
 
@@ -3647,6 +3573,26 @@
 }
 
 static int
+validate_exec_list(struct drm_i915_gem_exec_object2 *exec,
+		   int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		char __user *ptr = (char __user *)(uintptr_t)exec[i].relocs_ptr;
+		size_t length = exec[i].relocation_count * sizeof(struct drm_i915_gem_relocation_entry);
+
+		if (!access_ok(VERIFY_READ, ptr, length))
+			return -EFAULT;
+
+		if (fault_in_pages_readable(ptr, length))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv,
 		       struct drm_i915_gem_execbuffer2 *args,
@@ -3657,11 +3603,9 @@
 	struct drm_gem_object *batch_obj;
 	struct drm_i915_gem_object *obj_priv;
 	struct drm_clip_rect *cliprects = NULL;
-	struct drm_i915_gem_relocation_entry *relocs = NULL;
 	struct drm_i915_gem_request *request = NULL;
-	int ret, ret2, i, pinned = 0;
+	int ret, i, pinned = 0;
 	uint64_t exec_offset;
-	uint32_t reloc_index;
 	int pin_tries, flips;
 
 	struct intel_ring_buffer *ring = NULL;
@@ -3670,6 +3614,10 @@
 	if (ret)
 		return ret;
 
+	ret = validate_exec_list(exec_list, args->buffer_count);
+	if (ret)
+		return ret;
+
 #if WATCH_EXEC
 	DRM_INFO("buffers_ptr %d buffer_count %d len %08x\n",
 		  (int) args->buffers_ptr, args->buffer_count, args->batch_len);
@@ -3722,11 +3670,6 @@
 		goto pre_mutex_err;
 	}
 
-	ret = i915_gem_get_relocs_from_user(exec_list, args->buffer_count,
-					    &relocs);
-	if (ret != 0)
-		goto pre_mutex_err;
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto pre_mutex_err;
@@ -3765,19 +3708,16 @@
 	/* Pin and relocate */
 	for (pin_tries = 0; ; pin_tries++) {
 		ret = 0;
-		reloc_index = 0;
 
 		for (i = 0; i < args->buffer_count; i++) {
 			object_list[i]->pending_read_domains = 0;
 			object_list[i]->pending_write_domain = 0;
 			ret = i915_gem_object_pin_and_relocate(object_list[i],
 							       file_priv,
-							       &exec_list[i],
-							       &relocs[reloc_index]);
+							       &exec_list[i]);
 			if (ret)
 				break;
 			pinned = i + 1;
-			reloc_index += exec_list[i].relocation_count;
 		}
 		/* success */
 		if (ret == 0)
@@ -3967,20 +3907,6 @@
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
-	/* Copy the updated relocations out regardless of current error
-	 * state.  Failure to update the relocs would mean that the next
-	 * time userland calls execbuf, it would do so with presumed offset
-	 * state that didn't match the actual object state.
-	 */
-	ret2 = i915_gem_put_relocs_to_user(exec_list, args->buffer_count,
-					   relocs);
-	if (ret2 != 0) {
-		DRM_ERROR("Failed to copy relocations back out: %d\n", ret2);
-
-		if (ret == 0)
-			ret = ret2;
-	}
-
 	drm_free_large(object_list);
 	kfree(cliprects);
 	kfree(request);