Commit 89531e7d authored by Chris Wilson
Browse files

drm/i915: Replace global_seqno with a hangcheck heartbeat seqno



To determine whether an engine has 'stuck', we simply check whether or
not it is still on the same seqno for several seconds. To keep this simple
mechanism intact over the loss of a global seqno, we can simply add a
new global heartbeat seqno instead. As we cannot know the sequence in
which requests will then be completed, we use a primitive random number
generator instead (with a cycle long enough to not matter over an
interval of a few thousand requests between hangcheck samples).

The alternative to using a dedicated seqno on every request is to issue
a heartbeat request and query its progress through the system. Sadly
this requires us to reduce struct_mutex so that we can issue requests
without requiring that bkl.

v2: And without the extra CS_STALL for the hangcheck seqno -- we don't
need strict serialisation with what comes later, we just need to be sure
we don't write the hangcheck seqno before our batch is flushed.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190226094922.31617-1-chris@chris-wilson.co.uk
parent 37fc7845
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -1295,7 +1295,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
	with_intel_runtime_pm(dev_priv, wakeref) {
		for_each_engine(engine, dev_priv, id) {
			acthd[id] = intel_engine_get_active_head(engine);
			seqno[id] = intel_engine_get_seqno(engine);
			seqno[id] = intel_engine_get_hangcheck_seqno(engine);
		}

		intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
@@ -1315,8 +1315,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
	for_each_engine(engine, dev_priv, id) {
		seq_printf(m, "%s:\n", engine->name);
		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
			   engine->hangcheck.seqno, seqno[id],
			   intel_engine_last_submit(engine),
			   engine->hangcheck.last_seqno,
			   seqno[id],
			   engine->hangcheck.next_seqno,
			   jiffies_to_msecs(jiffies -
					    engine->hangcheck.action_timestamp));

+3 −2
Original line number Diff line number Diff line
@@ -1497,10 +1497,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
	if (i915_reset_failed(engine->i915))
		drm_printf(m, "*** WEDGED ***\n");

	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
		   intel_engine_get_seqno(engine),
		   intel_engine_last_submit(engine),
		   engine->hangcheck.seqno,
		   engine->hangcheck.last_seqno,
		   engine->hangcheck.next_seqno,
		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
	drm_printf(m, "\tReset count: %d (global %d)\n",
		   i915_reset_engine_count(error, engine),
+3 −3
Original line number Diff line number Diff line
@@ -133,21 +133,21 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
				  struct hangcheck *hc)
{
	hc->acthd = intel_engine_get_active_head(engine);
	hc->seqno = intel_engine_get_seqno(engine);
	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
}

static void hangcheck_store_sample(struct intel_engine_cs *engine,
				   const struct hangcheck *hc)
{
	engine->hangcheck.acthd = hc->acthd;
	engine->hangcheck.seqno = hc->seqno;
	engine->hangcheck.last_seqno = hc->seqno;
}

static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
		     const struct hangcheck *hc)
{
	if (engine->hangcheck.seqno != hc->seqno)
	if (engine->hangcheck.last_seqno != hc->seqno)
		return ENGINE_ACTIVE_SEQNO;

	if (intel_engine_is_idle(engine))
+15 −0
Original line number Diff line number Diff line
@@ -178,6 +178,12 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
		I915_GEM_HWS_INDEX_ADDR);
}

/*
 * Compute the address that the breadcrumb emitters pass to
 * gen8_emit_ggtt_write*() when storing the next hangcheck seqno:
 * the GGTT offset of the engine's status page vma plus
 * I915_GEM_HWS_HANGCHECK_ADDR.
 */
static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_HANGCHECK_ADDR);
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
@@ -2206,6 +2212,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
				  request->fence.seqno,
				  request->timeline->hwsp_offset);

	cs = gen8_emit_ggtt_write(cs,
				  intel_engine_next_hangcheck_seqno(request->engine),
				  intel_hws_hangcheck_address(request->engine));

	cs = gen8_emit_ggtt_write(cs,
				  request->global_seqno,
				  intel_hws_seqno_address(request->engine));
@@ -2230,6 +2240,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	cs = gen8_emit_ggtt_write_rcs(cs,
				      intel_engine_next_hangcheck_seqno(request->engine),
				      intel_hws_hangcheck_address(request->engine),
				      0);

	cs = gen8_emit_ggtt_write_rcs(cs,
				      request->global_seqno,
				      intel_hws_seqno_address(request->engine),
+34 −2
Original line number Diff line number Diff line
@@ -43,6 +43,12 @@
 */
#define LEGACY_REQUEST_SIZE 200

/*
 * Compute the address used by the PIPE_CONTROL based breadcrumbs when
 * writing the next hangcheck seqno: the GGTT offset of the engine's
 * status page vma plus I915_GEM_HWS_HANGCHECK_ADDR.
 */
static inline u32 hws_hangcheck_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_HANGCHECK_ADDR);
}

static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
@@ -316,6 +322,11 @@ static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = hws_hangcheck_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
@@ -422,6 +433,11 @@ static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = rq->timeline->hwsp_offset;
	*cs++ = rq->fence.seqno;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB;
	*cs++ = hws_hangcheck_address(rq->engine);
	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
@@ -447,12 +463,15 @@ static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->global_seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
@@ -472,6 +491,10 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->global_seqno;
@@ -487,6 +510,7 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
@@ -930,11 +954,16 @@ static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_STORE_DWORD_INDEX;
	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);

	*cs++ = MI_STORE_DWORD_INDEX;
	*cs++ = I915_GEM_HWS_INDEX_ADDR;
	*cs++ = rq->global_seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
@@ -956,6 +985,10 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_STORE_DWORD_INDEX;
	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);

	BUILD_BUG_ON(GEN5_WA_STORES < 1);
	for (i = 0; i < GEN5_WA_STORES; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
@@ -964,7 +997,6 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
	}

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
Loading