drm/i915: Emit to ringbuffer directly

This removes the usage of intel_ring_emit in favour of
directly writing to the ring buffer.

intel_ring_emit was preventing the compiler for optimising
fetch and increment of the current ring buffer pointer and
therefore generating very verbose code for every write.

It had no useful purpose since all ringbuffer operations
are started and ended with intel_ring_begin and
intel_ring_advance respectively, with no bail out in the
middle possible, so it is fine to increment the tail in
intel_ring_begin and let the code manage the pointer
itself.

Useless instruction removal amounts to approximately
two and half kilobytes of saved text on my build.

Not sure if this has any measurable performance
implications but executing a ton of useless instructions
on fast paths cannot be good.

v2:
 * Change return from intel_ring_begin to error pointer by
   popular demand.
 * Move tail increment to intel_ring_advance to enable some
   error checking.

v3:
 * Move tail advance back into intel_ring_begin.
 * Rebase and tidy.

v4:
 * Complete rebase after a few months since v3.

v5:
 * Remove unecessary cast and fix !debug compile. (Chris Wilson)

v6:
 * Make intel_ring_offset take request as well.
 * Fix recording of request postfix plus a sprinkle of asserts.
   (Chris Wilson)

v7:
 * Use intel_ring_offset to get the postfix. (Chris Wilson)
 * Convert GVT code as well.

v8:
 * Rename *out++ to *cs++.

v9:
 * Fix GVT out to cs conversion in GVT.

v10:
 * Rebase for new intel_ring_begin in selftests.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 34c0008..02e6bb2 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -10158,14 +10158,12 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
-	int ret;
+	u32 flip_mask, *cs;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	cs = intel_ring_begin(req, 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
 
 	/* Can't queue multiple flips, so wait for the previous
 	 * one to finish before executing the next.
@@ -10174,13 +10172,12 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, 0); /* aux display base address, unused */
+	*cs++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*cs++ = MI_NOOP;
+	*cs++ = MI_DISPLAY_FLIP | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*cs++ = fb->pitches[0];
+	*cs++ = intel_crtc->flip_work->gtt_offset;
+	*cs++ = 0; /* aux display base address, unused */
 
 	return 0;
 }
@@ -10192,26 +10189,23 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
-	int ret;
+	u32 flip_mask, *cs;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	cs = intel_ring_begin(req, 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
 
 	if (intel_crtc->plane)
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, MI_NOOP);
+	*cs++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*cs++ = MI_NOOP;
+	*cs++ = MI_DISPLAY_FLIP_I915 | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*cs++ = fb->pitches[0];
+	*cs++ = intel_crtc->flip_work->gtt_offset;
+	*cs++ = MI_NOOP;
 
 	return 0;
 }
@@ -10223,25 +10217,22 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	uint32_t pf, pipesrc;
-	int ret;
+	u32 pf, pipesrc, *cs;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	cs = intel_ring_begin(req, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
 
 	/* i965+ uses the linear or tiled offsets from the
 	 * Display Registers (which do not change across a page-flip)
 	 * so we need only reprogram the base address.
 	 */
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset |
-			intel_fb_modifier_to_tiling(fb->modifier));
+	*cs++ = MI_DISPLAY_FLIP | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*cs++ = fb->pitches[0];
+	*cs++ = intel_crtc->flip_work->gtt_offset |
+		intel_fb_modifier_to_tiling(fb->modifier);
 
 	/* XXX Enabling the panel-fitter across page-flip is so far
 	 * untested on non-native modes, so ignore it for now.
@@ -10249,7 +10240,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*cs++ = pf | pipesrc;
 
 	return 0;
 }
@@ -10261,21 +10252,17 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	uint32_t pf, pipesrc;
-	int ret;
+	u32 pf, pipesrc, *cs;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	cs = intel_ring_begin(req, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
+	*cs++ = MI_DISPLAY_FLIP | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*cs++ = fb->pitches[0] | intel_fb_modifier_to_tiling(fb->modifier);
+	*cs++ = intel_crtc->flip_work->gtt_offset;
 
 	/* Contrary to the suggestions in the documentation,
 	 * "Enable Panel Fitter" does not seem to be required when page
@@ -10285,7 +10272,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*cs++ = pf | pipesrc;
 
 	return 0;
 }
@@ -10298,9 +10285,8 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 				 uint32_t flags)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	uint32_t plane_bit = 0;
+	u32 *cs, plane_bit = 0;
 	int len, ret;
 
 	switch (intel_crtc->plane) {
@@ -10344,9 +10330,9 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, len);
-	if (ret)
-		return ret;
+	cs = intel_ring_begin(req, len);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
 
 	/* Unmask the flip-done completion message. Note that the bspec says that
 	 * we should do this for both the BCS and RCS, and that we must not unmask
@@ -10358,31 +10344,28 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	 * to zero does lead to lockups within MI_DISPLAY_FLIP.
 	 */
 	if (req->engine->id == RCS) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring, ~(DERRMR_PIPEA_PRI_FLIP_DONE |
-					  DERRMR_PIPEB_PRI_FLIP_DONE |
-					  DERRMR_PIPEC_PRI_FLIP_DONE));
+		*cs++ = MI_LOAD_REGISTER_IMM(1);
+		*cs++ = i915_mmio_reg_offset(DERRMR);
+		*cs++ = ~(DERRMR_PIPEA_PRI_FLIP_DONE |
+			  DERRMR_PIPEB_PRI_FLIP_DONE |
+			  DERRMR_PIPEC_PRI_FLIP_DONE);
 		if (IS_GEN8(dev_priv))
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8 |
-					      MI_SRM_LRM_GLOBAL_GTT);
+			*cs++ = MI_STORE_REGISTER_MEM_GEN8 |
+				MI_SRM_LRM_GLOBAL_GTT;
 		else
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM |
-					      MI_SRM_LRM_GLOBAL_GTT);
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring,
-				i915_ggtt_offset(req->engine->scratch) + 256);
+			*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
+		*cs++ = i915_mmio_reg_offset(DERRMR);
+		*cs++ = i915_ggtt_offset(req->engine->scratch) + 256;
 		if (IS_GEN8(dev_priv)) {
-			intel_ring_emit(ring, 0);
-			intel_ring_emit(ring, MI_NOOP);
+			*cs++ = 0;
+			*cs++ = MI_NOOP;
 		}
 	}
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, (MI_NOOP));
+	*cs++ = MI_DISPLAY_FLIP_I915 | plane_bit;
+	*cs++ = fb->pitches[0] | intel_fb_modifier_to_tiling(fb->modifier);
+	*cs++ = intel_crtc->flip_work->gtt_offset;
+	*cs++ = MI_NOOP;
 
 	return 0;
 }